Advertisement
Guest User

Untitled

a guest
Aug 20th, 2015
319
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.49 KB | None | 0 0
  # Build a keyword-in-context dataset from "*.lst" files and tabulate preceding-word classes per node.
import os
from datetime import datetime
from glob import glob
from html import unescape

import numpy as np
import pandas as pd

try:
    import regex as re
except ImportError:
    # Every pattern below is also valid for the standard-library engine, so
    # fall back to it when the third-party `regex` package is unavailable.
    import re

# Column order of the exported dataset
columnNames = ["fileName", "component", "precedingWord", "node", "leftContext", "sentence"]

# Input files are read from ../rawdata relative to the current working directory
subdir = "rawdata"
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, subdir))

# "Cache" regex
# See http://stackoverflow.com/q/452104/1150683
p_filename = re.compile(r"[./\\]")

p_sentence = re.compile(r"<sentence>(.*?)</sentence>")
p_typography = re.compile(r" (?:(?=[.,:;?!) ])|(?<=\( ))")
p_non_graph = re.compile(r"[^\x21-\x7E\s]")
p_quote = re.compile(r"\"")
p_ellipsis = re.compile(r"\.{3}(?=[^ ])")

p_last_word = re.compile(r"^.*\b(?<!-)(\w+(?:-\w+)*)[^\w]*$", re.U)

# Separators used to decide whether the node occurs as a token of its own
p_token = re.compile(r"[ :?.,]")

# Preceding words that are labelled "neuter" / "non_neuter" in the frequency table
neuter = ["het"]
non_neuter = ["de"]


def parse_file_name(file_path):
    """Return ``(node, component)`` taken from a lower-cased file path.

    The path is split on ``.``, ``/`` and ``\\``; the two pieces in front of
    the last one (the extension) are returned. Raises ``ValueError`` when the
    path does not contain that many pieces.
    """
    node, component = p_filename.split(file_path.lower())[-3:-1]
    return node, component


def clean_sentence(line):
    """Return the normalised ``<sentence>`` content of *line*, or ``None``.

    HTML entities are decoded first. The sentence is lower-cased, spaces in
    front of ``. , : ; ? ! )`` or another space and spaces right after ``(``
    are dropped, characters that are neither printable ASCII nor whitespace
    are removed, double quotes become single quotes and an ellipsis glued to
    the next character gets a space after it.

    ``None`` is returned for lines without a ``<sentence>`` element (blank
    lines, for instance) so that callers can skip them instead of crashing.
    """
    match = p_sentence.search(unescape(line))
    if match is None:
        return None
    text = match.group(1).lower()
    text = p_typography.sub("", text)
    text = p_non_graph.sub("", text)
    text = p_quote.sub("'", text)
    return p_ellipsis.sub("... ", text)


def node_pattern(node):
    """Compile the pattern that locates *node* as a separate word.

    The node is escaped because it comes from a file name and may contain
    regex metacharacters. ``$`` is accepted as a terminator so that a node
    closing the sentence without punctuation is found as well.
    """
    return re.compile(r"(^| )" + re.escape(node) + r'( |[!",.:;?})\]]|$)')


def left_context(sentence, node, pattern=None):
    """Return the text in front of the first occurrence of *node*.

    ``None`` is returned when *node* is not one of the tokens of *sentence*.
    *pattern* may carry a pre-compiled ``node_pattern(node)`` to avoid
    recompiling it for every sentence of the same file. When the token is
    present but the pattern does not match (the node is glued to preceding
    punctuation), the whole sentence is returned.
    """
    if node not in p_token.split(sentence):
        return None
    if pattern is None:
        pattern = node_pattern(node)
    # Only the part before the first hit is needed, so stop after one split.
    return pattern.split(sentence, maxsplit=1)[0]


def preceding_word(left):
    """Return the last (possibly hyphenated) word of *left*.

    A string without any word character is returned unchanged.
    """
    return p_last_word.sub(r"\1", left)


def rows_from_lines(lines, node, component):
    """Turn an iterable of raw lines into a list of dataset row dicts."""
    file_name = ".".join([node, component])
    pattern = node_pattern(node)
    rows = []
    for line in lines:
        sentence = clean_sentence(line)
        if sentence is None:
            continue
        left = left_context(sentence, node, pattern)
        if left is None:
            continue
        rows.append(dict(fileName=file_name, component=component,
                         precedingWord=preceding_word(left), node=node,
                         leftContext=left, sentence=sentence))
    return rows


def build_dataset(raw_dir=path):
    """Read every ``*.lst`` file in *raw_dir* into one DataFrame.

    Rows are collected in a plain list and the frame is built once at the
    end; appending to a DataFrame per row copies the whole frame each time
    and ``DataFrame.append`` no longer exists in pandas 2.
    """
    rows = []
    # os.path.join keeps the glob pattern valid on every platform.
    for file_path in glob(os.path.join(raw_dir, "*.lst")):
        node, component = parse_file_name(file_path)
        with open(file_path, encoding="utf-8") as f:
            rows.extend(rows_from_lines(f, node, component))
    return pd.DataFrame(rows, columns=columnNames)


def add_gender(df):
    """Add a ``gender`` column derived from ``precedingWord`` and return *df*.

    Values are "neuter", "non_neuter" or "rest". ``non_neuter`` is listed
    first so that it wins for a word present in both lists, as it did when
    the labels were assigned one after the other.
    """
    is_neuter = df.precedingWord.isin(neuter)
    is_non_neuter = df.precedingWord.isin(non_neuter)
    df["gender"] = np.select([is_non_neuter, is_neuter],
                             ["non_neuter", "neuter"], default="rest")
    return df


def main():
    """Export the dataset and its node-by-gender frequency table."""
    start_time = datetime.now()

    df = build_dataset(path)

    # Export dataset
    os.makedirs("dataset", exist_ok=True)
    df.to_csv("dataset/py-dataset.csv", sep="\t", encoding="utf-8")

    # Let's make a frequency list
    add_gender(df)
    freqDf = pd.crosstab(df.node, df.gender)
    freqDf.to_csv("dataset/py-frequencies.csv", sep="\t", encoding="utf-8")

    # How long has the script been running?
    time_difference = datetime.now() - start_time
    print("Time difference of", time_difference)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement