Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import xml.etree.ElementTree as ET
- import re
# Characters that separate terms in the tweet text: everything except ASCII
# letters, digits and the double quote (underscore is a separator too).
_TERM_SEPARATORS = re.compile(r'[^A-Za-z0-9"]+')
# Characters stripped from each location token.
_NON_ALNUM = re.compile(r"[^a-zA-Z0-9]+")


def _ascii_lower(token):
    """Return *token* lower-cased with every non-ASCII character dropped."""
    return "".join(ch.lower() for ch in token if ord(ch) < 128)


def occupyFiles(inputFileLoc):
    """Index an XML tweet dump into ``tweets.txt``, ``terms.txt`` and ``dates.txt``.

    The three output files are created (or truncated) in the current working
    directory. For every child element of the XML root the function writes:

    * ``terms.txt`` -- ``t-<word>:<id>`` for each text term longer than two
      characters, ``t-quot:<id>`` once per double-quote character found in
      the text, ``n-<word>:<id>`` for each name token and ``l-<word>:<id>``
      for each location token (alphanumeric characters only);
    * ``dates.txt`` -- ``<date>:<id>``;
    * ``tweets.txt`` -- ``<id>:`` followed by the raw input line of the record.

    Records are addressed positionally: ``child[0]`` is used as the id,
    ``child[1]`` as the date, ``child[2]`` as the text, ``child[4][0]`` as the
    name and ``child[4][1]`` as the location.

    The raw record for ``tweets.txt`` is obtained by skipping the first two
    lines of the input file and then reading one line per record, so the
    input must be laid out with two header lines followed by exactly one
    record per line.

    Args:
        inputFileLoc: Path of the XML file to index.

    Raises:
        OSError: If the input file cannot be read or an output file cannot
            be written.
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    # Parse before touching the output files so that a missing or malformed
    # input does not wipe the results of a previous run.
    root = ET.parse(inputFileLoc).getroot()

    # Context managers guarantee all four handles are closed even when a
    # record is malformed and an exception propagates.
    with open(inputFileLoc, "r") as raw_lines, \
            open("tweets.txt", "w") as tweet_out, \
            open("terms.txt", "w") as term_out, \
            open("dates.txt", "w") as date_out:
        # Skip the two header lines that precede the first record.
        raw_lines.readline()
        raw_lines.readline()

        for record in root:
            tweet_id = record[0].text
            suffix = ":" + tweet_id + "\n"

            # Terms. An empty <text/> element yields None, treated as "".
            body = _TERM_SEPARATORS.sub(" ", record[2].text or "")
            for term in body.split():
                # One marker line per double quote, emitted before the word.
                for _ in range(term.count('"')):
                    term_out.write("t-quot" + suffix)
                word = _ascii_lower(term.replace('"', ""))
                if len(word) > 2:
                    term_out.write("t-" + word + suffix)

            # Names: only non-ASCII characters are removed; punctuation stays.
            for name in (record[4][0].text or "").split():
                word = _ascii_lower(name)
                if word:
                    term_out.write("n-" + word + suffix)

            # Locations: reduced to ASCII letters and digits.
            for location in (record[4][1].text or "").split():
                word = _NON_ALNUM.sub("", location).lower()
                if word:
                    term_out.write("l-" + word + suffix)

            date_out.write(record[1].text + suffix)
            # The raw line keeps its own trailing newline.
            tweet_out.write(tweet_id + ":" + raw_lines.readline())
if __name__ == "__main__":
    # Prompt only when executed as a script, so that importing this module
    # does not block on standard input.
    inputfile = input("Enter input file: ")
    occupyFiles(inputfile)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement