Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Build a keyword-in-context dataset and a gender frequency table.

Every ``*.lst`` file in ``../rawdata`` (relative to the working directory)
is read line by line.  The two dot-separated parts of the file name that
precede the extension are taken as the *node* word and the *component*.
For each line that holds a ``<sentence>`` element containing the node, the
script records the cleaned sentence, the text to the left of the node and
the last word of that left context.

Outputs (tab-separated, UTF-8):
    dataset/py-dataset.csv      one row per matching sentence
    dataset/py-frequencies.csv  node x gender crosstab, where gender is
                                derived from the preceding word
"""
import os
from datetime import datetime
from glob import glob
from html import unescape

import numpy as np
import pandas as pd
import regex as re

start_time = datetime.now()

# Column order of the exported dataset.
columnNames = ["fileName", "component", "precedingWord", "node", "leftContext", "sentence"]

# Directory the input files are fetched from.
subdir = "rawdata"
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, subdir))

# Directory the CSV files are written to.
output_dir = "dataset"

# "Cache" regex
# See http://stackoverflow.com/q/452104/1150683
p_filename = re.compile(r"[./\\]")
p_sentence = re.compile(r"<sentence>(.*?)</sentence>")
p_typography = re.compile(r" (?:(?=[.,:;?!) ])|(?<=\( ))")
p_non_graph = re.compile(r"[^\x21-\x7E\s]")
p_quote = re.compile(r"\"")
p_ellipsis = re.compile(r"\.{3}(?=[^ ])")
p_last_word = re.compile(r"^.*\b(?<!-)(\w+(?:-\w+)*)[^\w]*$", re.U)
p_token_sep = re.compile(r"[ :?.,]")


def _clean_sentence(line):
    """Return the normalised sentence found in *line*, or None if there is none."""
    found = p_sentence.search(unescape(line))
    if found is None:
        # Lines without a <sentence> element carry no data; skip them
        # instead of failing on ``None.group``.
        return None
    s = found.group(1).lower()
    s = p_typography.sub("", s)
    s = p_non_graph.sub("", s)
    s = p_quote.sub("'", s)
    s = p_ellipsis.sub("... ", s)
    return s


# Collect plain dicts and build the DataFrame once afterwards:
# DataFrame.append copied the whole frame on every call and no longer
# exists in pandas >= 2.0.
rows = []

# Loop files in folder
for file_path in glob(os.path.join(path, "*.lst")):
    # ".../<node>.<component>.lst" -> node, component
    n, c = p_filename.split(file_path.lower())[-3:-1]
    fn = ".".join([n, c])

    # The node pattern is the same for every line of a file, so compile it
    # once.  The node is escaped so it is always matched literally, and "$"
    # lets a sentence-final node count as a hit as well.
    p_node = re.compile(r"(^| )" + re.escape(n) + r'( |[!",.:;?})\]]|$)')

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            s = _clean_sentence(line)
            if s is None or n not in p_token_sep.split(s):
                continue

            # Left context is everything before the first node hit; when the
            # pattern finds no hit the whole sentence is kept, which is what
            # splitting on the pattern would yield.
            hit = p_node.search(s)
            lc = s[:hit.start()] if hit else s
            pw = p_last_word.sub("\\1", lc)

            rows.append(dict(fileName=fn, component=c,
                             precedingWord=pw, node=n,
                             leftContext=lc, sentence=s))

# A freshly built frame already has a clean 0..n-1 index.
df = pd.DataFrame(rows, columns=columnNames)

# Export dataset
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, "py-dataset.csv"), sep="\t", encoding="utf-8")

# Let's make a frequency list
# Define neuter and non_neuter
neuter = ["het"]
non_neuter = ["de"]

# Label each row by its preceding word; anything else is "rest".
df["gender"] = np.select(
    [df.precedingWord.isin(neuter), df.precedingWord.isin(non_neuter)],
    ["neuter", "non_neuter"],
    default="rest",
)

# Create crosstab
freqDf = pd.crosstab(df.node, df.gender)
freqDf.to_csv(os.path.join(output_dir, "py-frequencies.csv"), sep="\t", encoding="utf-8")

# How long has the script been running?
time_difference = datetime.now() - start_time
print("Time difference of", time_difference)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement