B
    b                 @   s   d dl mZ d dlZd dlm  mZ eedddZej	dddZ
d	d
 Zej	dddZee ej	edddZej	dddZdS )    )ListN)phraselgc             C   sV   |dkrt d}n|dkr(t d}ntd| rRdd dd || D D S g S )	zTokenize an utterance based on specified word-forming characters.

    Parameters:
        phrase: the string to be tokenized
        lg: the language whose word-forming characters are to be used in tokenization
    
    Return list of tokens
    tcaz([^0-9A-Za-z\+#_])mtou"   ([^A-Za-zäëöü'`ꞌꞋ'‘’])z/Language either not given or an invalid string.c             S   s   g | ]}|r|qS  r   ).0jr   r   //home/sunny/Documents/lx/flexible/flexible02.py
<listcomp>   s    ztokenize.<locals>.<listcomp>c             S   s   g | ]}|  qS r   )strip)r   ir   r   r
   r      s    )recompile
ValueErrorsplit)r   r   Znon_word_formingr   r   r
   tokenize   s    
r   )elc          
   C   s(   t d| jdt| jd| jdt|  dS )zPrint the tag, attributes, text, and number of children of an ET.Element
    
    Parameters:
        el: Element to be printed
    zTag:z
Attrs:z
Text:z
No. of Children:N)printtagstrattribtextlen)r   r   r   r
   print_el_info   s    r   c              C   s   t tddd d } t| dd }ddt|  | }|dd	 d
 |d	d  d
 |dd  d
 |dd  d
 |dd  }tdddt|  |S )zGenerate FLEx guid based on offset defined in offset.txt

    Increments offset upon use.

    Return guid in format [0-f]{8}-([0-f]{4}-){3}[0-f]{12}
    z
offset.txtr)mode      N0       -         w)intopenreadhexr   writer   )Zglobal_offsetZnew_guid_numZnew_guid_strZnew_guidr   r   r
   generate_guid#   s    Lr,   )eaf_rootc             C   s   dd |  dD S )zGet time IDs and values from an EAF file
    
    Parameters:
        eaf_root: is the root element of an EAF object parsed through ElementTree

    Return a dictionary of time ID and value pairs
    c             S   s   i | ]}|j d  |j d qS )Z
TIME_VALUEZTIME_SLOT_ID)r   )r   r   r   r   r
   
<dictcomp>>   s   ztime_values.<locals>.<dictcomp>z.//TIME_SLOT)findall)r-   r   r   r
   time_values2   s    r0   )tokenized_utt	phrase_elr   c       	      C   s   t d}x| D ]}t jddt id}|dkrjtd}||rJd}nd}t jd	|d
|dd}nJ|dkrtd}||rd}nd}t jd	|d|dd}ntd||_|	| |	| qW |	| dS )a  Populate a phrase element with a tokenized utterance

    *tokenized_utt* is a list with tokens from an utterance
    *phrase_el* is the element that will be the parent to the words added (below the words el in the phrase_el will be the items w/ translations and notes)
    *lg* is the language whose word-forming characters are to be used in tokenization
    
    Makes changes in place (returns nothing)
    wordswordguid)r   r   z([^0-9A-Za-z\+#_])punctZtxtitemz{}-fonipa-x-etic)typelangr   u"   ([^A-Za-zäëöü'`ꞌꞋ'‘’])z/Language either not given or an invalid string.N)
ETElementr,   r   r   searchformatr   r   append)	r1   r2   r   r3   tokenr4   Zword_formingr8   Ztoken_elr   r   r
   add_word_elB   s(    	






r@   )eafc             C   sJ   xD|  dD ]6}x0|  dd|jd  d D ]}| | q0W qW dS )zRemove the tiers that use the "Included In" stereotype constraint

    Parameters:
        eaf: The root element of an EAF file
    
    Makes changes in place (returns nothing)    
    z .//*[@CONSTRAINTS='Included_In']z.//*[@LINGUISTIC_TYPE_REF={}]'ZLINGUISTIC_TYPE_IDN)r/   r=   r   remove)rA   Zincluded_inZbad_tierr   r   r
   remove_included_ink   s    $rD   )typingr   r   xml.etree.ElementTreeetreeElementTreer:   r   r   r;   r   r,   r0   r@   rD   r   r   r   r
   <module>   s   	)