o
    ~b                     @   s   d dl mZ d dlm  mZ d dlZd dlZee	dZ
e
d d ede
d d  d e
d d	 ede
d d	  d iZd
edefddZdejfddZdd ZdejfddZdee dejdefddZdedejfddZdS )    )ListNzflibl_config.json	languagesmain_languagez([^Zvalid_charactersz])child_languagephraselgc                 C   sr   | r7dd dd t | | D D }g }d}|D ]}|dkr$||7 }q|r/|d|  d}|| q|S g S )zTokenize an utterance based on specified word-forming characters.

    Parameters:
        phrase: the string to be tokenized
        lg: the language whose word-forming characters are to be used in tokenization
    
    Return list of tokens
    c                 S   s   g | ]}|r|qS  r   ).0jr   r   M/Users/pat/Sites/forum-docling/projects/sunny-flibl/flexible-main/flexible.py
<listcomp>       ztokenize.<locals>.<listcomp>c                 S   s   g | ]}|  qS r   )stripr	   ir   r   r   r      r    . )word_formingsplitappend)r   r   ZtokensZcollected_tokensZpunct_charstokenr   r   r   tokenize   s   	"
r   elc              
   C   s(   t d| jdt| jd| jdt|  dS )zPrint the tag, attributes, text, and number of children of an ET.Element
    
    Parameters:
        el: Element to be printed
    zTag:z
Attrs:z
Text:z
No. of Children:N)printtagstrattribtextlen)r   r   r   r   print_el_info)   s   (r    c                  C   s   t tddd d } t| dd }ddt|  | }|dd	 d
 |d	d  d
 |dd  d
 |dd  d
 |dd  }tdddt|  |S )zGenerate FLEx guid based on offset defined in offset.txt

    Increments offset upon use.

    Return guid in format [0-f]{8}-([0-f]{4}-){3}[0-f]{12}
    z
offset.txtr)mode      N0       -         w)intopenreadhexr   writer   )Zglobal_offsetZnew_guid_numZnew_guid_strZnew_guidr   r   r   generate_guid1   s   Lr2   eaf_rootc                 C   s   dd |  dD S )zGet time IDs and values from an EAF file
    
    Parameters:
        eaf_root: is the root element of an EAF object parsed through ElementTree

    Return a dictionary of time ID and value pairs
    c                 S   s   i | ]}|j d  |j d qS )ZTIME_SLOT_IDZ
TIME_VALUEr   r   r   r   r   
<dictcomp>H   s    ztime_values.<locals>.<dictcomp>z.//TIME_SLOT)findall)r3   r   r   r   time_values@   s   r7   tokenized_utt	phrase_elc           	      C   s   t d}t| }| D ]-}t jddt id}||rd}nd}t jd||dd}||_|| || q|| d	S )
a  Populate a phrase element with a tokenized utterance

    *tokenized_utt* is a list with tokens from an utterance
    *phrase_el* is the element that will be the parent to the words added (below the words el in the phrase_el will be the items w/ translations and notes)
    *lg* is the language whose word-forming characters are to be used in tokenization
    
    Makes changes in place (returns nothing)
    wordswordguidr4   punctZtxtitem)typelangN)ETElementr   r2   searchr   r   )	r8   r9   r   r:   Zutterance_word_formingr   r;   r?   Ztoken_elr   r   r   add_word_elL   s   
	

rD   
constrainteafc                 C   s@   | d| D ]}| d|jd D ]}|| qqdS )zRemove the tiers that use the "Included In" stereotype constraint

    Parameters:
        eaf: The root element of an EAF file
    
    Makes changes in place (returns nothing)    
    z.//*[@CONSTRAINTS='{}']z.//*[@LINGUISTIC_TYPE_REF='{}']ZLINGUISTIC_TYPE_IDN)r6   formatr   remove)rE   rF   Zconstraint_tierZbad_tierr   r   r   remove_constrainte   s
   rI   )typingr   xml.etree.ElementTreeetreeElementTreerA   jsonreloadr.   configcompiler   r   r   rB   r    r2   r7   rD   rI   r   r   r   r   <module>   s   ""