Advertisement
Guest User

Untitled

a guest
Nov 21st, 2017
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.56 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2. import re
  3. def occupyFiles(inputFileLoc):
  4.         inputFile = open(inputFileLoc,"r")
  5.         tweetFile = open("tweets.txt","w")
  6.         termFile = open("terms.txt","w")
  7.         dateFile = open("dates.txt","w")
  8.         tree = ET.parse(inputFileLoc)
  9.         root = tree.getroot()
  10.         inputFile.readline()
  11.         inputFile.readline()
  12.         #each different tweet in  xml file
  13.         for child in root:
  14.                 text=""
  15.                 #splits on all non alphanumeric/underscore characters
  16.                 text=re.sub('[^A-Za-z0-9"]+'," ",child[2].text)
  17.                 #adding to the term file, in order: terms, names, locations  
  18.                 for term in text.split():
  19.                     word=""
  20.                     for letter in term:
  21.                         #print(ord(letter),letter)
  22.                         if((ord(letter[0])==34)):
  23.                             termFile.write('t-quot:'+child[0].text+'\n')
  24.                             continue
  25.                         if(ord(letter[0])<128):
  26.                             word=word+letter.lower()
  27.                     if(len(word)>2):
  28.                         termFile.write('t-'+word+':'+child[0].text+'\n')
  29.                 #writes names
  30.                 for name in child[4][0].text.split():
  31.                     word=""
  32.                     for letter in name:
  33.                         if(ord(letter[0])<128):
  34.                             word=word+letter.lower()
  35.                     if len(word)>0:
  36.                         termFile.write('n-'+word+':'+child[0].text+'\n')
  37.                 #writes locations (only alphanumeric characters)
  38.                 if (child[4][1].text is not None):  
  39.                     for location in child[4][1].text.split():
  40.                         word=""
  41.                         location=re.sub("[^a-zA-Z0-9]+","",location)
  42.                         for letter in location:
  43.                             if(ord(letter[0])<128):
  44.                                 word=word+letter.lower()
  45.                         if word != "":        
  46.                             termFile.write('l-'+location.lower()+':'+child[0].text+'\n')
  47.                 #writing to the date file
  48.                 dateFile.write(child[1].text+":"+child[0].text+'\n')
  49.                 #writing to the tweet file
  50.                 tweetFile.write(child[0].text+":"+inputFile.readline())
  51.        
  52.         inputFile.close()
  53.         tweetFile.close()
  54.         termFile.close()
  55.         dateFile.close()
  56.         return
  57.  
  58. inputfile=input("Enter input file: ")
  59. occupyFiles(inputfile)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement