B
    <b                 @   s   d dl mZ d dlmZ d dlZd dlm  mZ e	dZ
e
ZeZefedddZeedd	d
ZejdddZdd ZejdddZefee ejedddZejdddZdS )    )collect)ListNu   ([^A-Za-zäëöüÄËÖÜ'`ꞌꞋ'‘’\u00E4\u00EB\u00F6\u00FC\u00C4\u00CB\u00D6\u00DC\u0308áéíóúÁÉÍÓÚ\u00E1\u00E9\u00ED\u00F3\u00FA\u00C1\u00C9\u00CD\u00D3\u00DA]))textc             C   s   t || S )zDetermine if a string has non word-forming characters, using the defined word_forming regex

    Parameters:
        text: the string to be checked
    )boolsearch)r   word_forming r   //home/sunny/Documents/lx/flexible/flexible05.pyis_punct   s    r
   )phraselgc             C   sr   | rjdd dd t | D D }g }d}x:|D ]2}t|rF||7 }q0|rX|| d}|| q0W |S g S dS )zTokenize an utterance based on specified word-forming characters.

    Parameters:
        phrase: the string to be tokenized
        lg: the language whose word-forming characters are to be used in tokenization
    
    Return list of tokens
    c             S   s   g | ]}|r|qS r   r   ).0jr   r   r	   
<listcomp>9   s    ztokenize.<locals>.<listcomp>c             S   s   g | ]}|  qS r   )strip)r   ir   r   r	   r   9   s     N)r   splitr
   append)r   r   tokensZcollected_tokensZpunct_charstokenr   r   r	   tokenize!   s    


r   )elc          
   C   s(   t d| jdt| jd| jdt|  dS )zPrint the tag, attributes, text, and number of children of an ET.Element
    
    Parameters:
        el: Element to be printed
    zTag:z
Attrs:z
Text:z
No. of Children:N)printtagstrattribr   len)r   r   r   r	   print_el_infoH   s    r   c              C   s   t tddd d } t| dd }ddt|  | }|dd	 d
 |d	d  d
 |dd  d
 |dd  d
 |dd  }tdddt|  |S )zGenerate FLEx guid based on offset defined in offset.txt

    Increments offset upon use.

    Return guid in format [0-f]{8}-([0-f]{4}-){3}[0-f]{12}
    z
offset.txtr)mode      N0       -         w)intopenreadhexr   writer   )Zglobal_offsetZnew_guid_numZnew_guid_strZnew_guidr   r   r	   generate_guidQ   s    Lr0   )eaf_rootc             C   s   dd |  dD S )zGet time IDs and values from an EAF file
    
    Parameters:
        eaf_root: is the root element of an EAF object parsed through ElementTree

    Return a dictionary of time ID and value pairs
    c             S   s   i | ]}|j d  |j d qS )Z
TIME_VALUEZTIME_SLOT_ID)r   )r   r   r   r   r	   
<dictcomp>l   s   ztime_values.<locals>.<dictcomp>z.//TIME_SLOT)findall)r1   r   r   r	   time_values`   s    r4   )tokenized_utt	phrase_elr   c       	      C   s   t d}x| D ]}t jddt id}|dkrZ||r@d}nd}t jd||d	d}np|d
kr||rrd}nd}t jd||d	d}n>|dkrt}||rd}nd}t jd||d	d}ntd||_|| || qW || dS )a  Populate a phrase element with a tokenized utterance

    *tokenized_utt* is a list with tokens from an utterance
    *phrase_el* is the element that will be the parent to the words added (below the words el in the phrase_el will be the items w/ translations and notes)
    *lg* is the language whose word-forming characters are to be used in tokenization
    
    Makes changes in place (returns nothing)
    wordswordguid)r   ZtcapunctZtxtitem)typelangmtocpsz/Language either not given or an invalid string.N)ETElementr0   r   cps_word_forming
ValueErrorr   r   )	r5   r6   r   r   r7   r   r8   r<   Ztoken_elr   r   r	   add_word_elp   s0    	





rD   )eafc             C   sJ   xD|  dD ]6}x0|  dd|jd  d D ]}| | q0W qW dS )zRemove the tiers that use the "Included In" stereotype constraint

    Parameters:
        eaf: The root element of an EAF file
    
    Makes changes in place (returns nothing)    
    z .//*[@CONSTRAINTS='Included_In']z.//*[@LINGUISTIC_TYPE_REF={}]'ZLINGUISTIC_TYPE_IDN)r3   formatr   remove)rE   Zincluded_inZbad_tierr   r   r	   remove_included_in   s    $rI   )gcr   typingr   rexml.etree.ElementTreeetreeElementTreer@   compileZmto_word_formingr   rB   r   r
   r   rA   r   r0   r4   rD   rI   r   r   r   r	   <module>   s   
	'	1