Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import xml.etree.ElementTree as ET
- import requests
# Path of the treebank XML file to inspect.
arquivo = "C1_Extrato_2_Palavras.xml"
# Tag filter for the overview pass; "*" makes Element.iter() visit every element.
filtro = "*"


def describe(elem):
    """Return a one-line, printable summary of an XML element.

    The line holds the tag, then each attribute as name="value", then the
    element's text if it has any non-whitespace text. Elements such as <t>
    and <edge> in this corpus are self-closing and keep all their data in
    attributes, so printing only ``elem.text`` shows ``None`` for them.
    """
    parts = [elem.tag]
    parts.extend('{}="{}"'.format(key, value) for key, value in elem.attrib.items())
    text = (elem.text or "").strip()
    if text:
        parts.append(text)
    return " ".join(parts)


def find_graphs(root):
    """Return every <graph> element found under a <body> child of *root*.

    In the corpus <graph> sits inside <s>, not directly inside <body>, so a
    direct-children lookup (``body.findall("graph")``) never matches.
    ``iter`` searches the whole subtree of each <body> instead.
    """
    graphs = []
    for body in root.findall("body"):
        graphs.extend(body.iter("graph"))
    return graphs


def main():
    """Parse ``arquivo`` and print all elements, then the graphs."""
    root = ET.parse(arquivo).getroot()

    # Pass 1: overview of every element matching the tag filter.
    for child in root.iter(filtro):
        print(describe(child))

    # Blank separator between the two listings (the original printed a
    # literal "n", presumably a newline escape that lost its backslash).
    print("\n")

    # Pass 2: the <graph> elements nested under <body>.
    for graph in find_graphs(root):
        print(describe(graph))


if __name__ == "__main__":
    main()
- corpus
- body
- s
- graph
- terminals
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- t None
- nonterminals
- nt
- edge None
- nt
- edge None
- edge None
- edge None
- edge None
- nt
- <?xml version="1.0" encoding="UTF-8"?>
- <corpus>
- <body>
- <s id="s1" ref="1" source="Running text" forest="1" text="Um acidente aéreo na localidade de Bukavu, no leste da República Democrática do Congo, matou 17 pessoas na quinta-feira à tarde, informou hoje um porta-voz das Nações Unidas.">
- <graph root="s1_500">
- <terminals>
- <t id="s1_1" word="Um" lemma="um" pos="art" morph="M S" extra="* "/>
- <t id="s1_2" word="acidente" lemma="acidente" pos="n" morph="M S" sem="event" extra="--"/>
- <t id="s1_3" word="aéreo" lemma="aéreo" pos="adj" morph="M S" extra="nh np-close"/>
- <t id="s1_4" word="em" lemma="em" pos="prp" morph="--" extra="sam- np-long"/>
- <t id="s1_5" word="a" lemma="o" pos="art" morph="F S" extra="-sam "/>
- <t id="s1_6" word="localidade" lemma="localidade" pos="n" morph="F S" sem="Labs Lciv" extra="--"/>
- <t id="s1_7" word="de" lemma="de" pos="prp" morph="--" extra="np-close"/>
- <t id="s1_8" word="Bukavu" lemma="Bukavu" pos="prop" morph="M/F S" extra="civ * heur"/>
- <t id="s1_9" word="," lemma="--" pos="pu" morph="--" extra="--"/>
- <t id="s1_10" word="em" lemma="em" pos="prp" morph="--" extra="sam-"/>
- <t id="s1_11" word="o" lemma="o" pos="art" morph="M S" extra="-sam "/>
- <t id="s1_12" word="leste" lemma="leste" pos="n" morph="M S" sem="dir" extra="--"/>
- <t id="s1_13" word="de" lemma="de" pos="prp" morph="--" extra="sam- np-close"/>
- <t id="s1_14" word="a" lemma="o" pos="art" morph="F S" extra="-sam "/>
- <t id="s1_15" word="República_Democrática_do_Congo" lemma="República_Democrática_do_Congo" pos="prop" morph="F S" extra="civ *"/>
- <t id="s1_16" word="," lemma="--" pos="pu" morph="--" extra="--"/>
- <t id="s1_17" word="matou" lemma="matar" pos="v-fin" morph="PS 3S IND VFIN" extra="cjt-head cjt-head-STA fmc mv"/>
- <t id="s1_18" word="17" lemma="17" pos="num" morph="F P" extra="card"/>
- <t id="s1_19" word="pessoas" lemma="pessoa" pos="n" morph="F P" sem="H" extra="--"/>
- <t id="s1_20" word="em" lemma="em" pos="prp" morph="--" extra="sam-"/>
- <t id="s1_21" word="a" lemma="o" pos="art" morph="F S" extra="-sam "/>
- <t id="s1_22" word="quinta-feira" lemma="quinta-feira" pos="n" morph="F S" sem="temp" extra="--"/>
- <t id="s1_23" word="a" lemma="a" pos="prp" morph="--" extra="sam-"/>
- <t id="s1_24" word="a" lemma="o" pos="art" morph="F S" extra="-sam "/>
- <t id="s1_25" word="tarde" lemma="tarde" pos="n" morph="F S" sem="per" extra="--"/>
- <t id="s1_26" word="," lemma="--" pos="pu" morph="--" extra="--"/>
- <t id="s1_27" word="informou" lemma="informar" pos="v-fin" morph="PS 3S IND VFIN" extra="nosubj nosubj cjt-STA vH fmc mv"/>
- <t id="s1_28" word="hoje" lemma="hoje" pos="adv" morph="--" extra="--"/>
- <t id="s1_29" word="um" lemma="um" pos="art" morph="M S" extra="--"/>
- <t id="s1_30" word="porta-voz" lemma="porta-voz" pos="n" morph="M S" sem="tool Hprof" extra="--"/>
- <t id="s1_31" word="de" lemma="de" pos="prp" morph="--" extra="sam-"/>
- <t id="s1_32" word="as" lemma="o" pos="art" morph="F P" extra="-sam "/>
- <t id="s1_33" word="Nações_Unidas" lemma="Nações_Unidas" pos="prop" morph="F P" extra="org * newlex"/>
- <t id="s1_34" word="." lemma="--" pos="pu" morph="--" extra="--"/>
- </terminals>
- <nonterminals>
- <nt id="s1_500" cat="s">
- <edge label="STA" idref="s1_501"/>
- </nt>
- <nt id="s1_501" cat="par">
- <edge label="CJT" idref="s1_502"/>
- <edge label="PU" idref="s1_26"/>
- <edge label="CJT" idref="s1_516"/>
- <edge label="PU" idref="s1_34"/>
- </nt>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement