Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## Build a "dictionary" mapping each word to its most frequently used POS tag.
# Each input line of atraiter.txt is expected as "<count> <word> <POS>", with
# all lines for the same word consecutive (grouped by word) —
# NOTE(review): assumption inferred from the grouping logic; confirm the file format.
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/atraiter.txt") as f:
    content = [line.strip() for line in f]
content_reforme = [line.split() for line in content]

table_of_words = []  # unique words; index matches the index of their POS below
table_of_POS = []    # most frequently used POS for the word at the same index

# "with" guarantees the output file is closed even on error. Mode "a+" is kept
# from the original (re-running the script appends duplicate lines —
# NOTE(review): "w" is probably what was intended; confirm before changing).
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/dictionnaire.txt", "a+") as output:
    n = len(content_reforme)
    if n:  # guard: the original crashed on an empty input file
        max_current = int(content_reforme[0][0])  # best count seen in the current group
        num_max = 0                               # row index of that best entry
        for i in range(n - 1):
            if content_reforme[i][1] == content_reforme[i + 1][1]:
                # Same word: remember the entry with the highest count.
                if int(content_reforme[i + 1][0]) > max_current:
                    max_current = int(content_reforme[i + 1][0])
                    num_max = i + 1
            else:
                # Word changed: flush the best entry of the finished group.
                output.write(content_reforme[num_max][1] + ' ' + content_reforme[num_max][2] + '\n')
                table_of_words.append(content_reforme[num_max][1])
                table_of_POS.append(content_reforme[num_max][2])
                max_current = int(content_reforme[i + 1][0])
                num_max = i + 1
        # Flush the final group using its BEST entry. Bug fix: the original
        # wrote content_reforme[i+1] (the last line of the file) instead of
        # content_reforme[num_max], so when the last group's most frequent
        # entry was not its last line, the wrong POS was recorded.
        output.write(content_reforme[num_max][1] + ' ' + content_reforme[num_max][2] + '\n')
        table_of_words.append(content_reforme[num_max][1])
        table_of_POS.append(content_reforme[num_max][2])
## Creation of POS tags using the baseline method: for every word in
## wordsToFindPOS.txt, look up its most frequent POS and write
## "<word> <POS>" to POS_baseline.txt.
# Build the lookup table once: a dict lookup is O(1) versus list.index's O(n),
# turning the tagging loop from O(n*m) into O(n+m).
pos_of_word = dict(zip(table_of_words, table_of_POS))
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/wordsToFindPOS.txt") as f:
    content = [line.strip() for line in f]
# "with" replaces the original's manual close (which was called twice) and
# closes the file even if a lookup fails. Mode "a+" kept from the original.
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/POS_baseline.txt", "a+") as output:
    for k in content:
        # KeyError here means the word never appeared in the training data
        # (the original raised ValueError from list.index in the same case).
        output.write(k + ' ' + pos_of_word[k] + '\n')
## Compare the baseline POS tags to the reference POS tags and print the
## error rate (fraction of words whose predicted tag differs).
# NOTE(review): the original bound POS_baseline.txt to "originalPOS" and
# POS_trainSet.txt to "baselinePOS" — the names were swapped; renamed here
# so the code reads correctly (the comparison itself is unchanged).
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/POS_baseline.txt") as f:
    baseline_reforme = [line.split() for line in f]  # rows of [word, POS]
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/POS_trainSet.txt") as f:
    reference_reforme = [line.split() for line in f]
n = len(reference_reforme)
# Count positions where the predicted tag (column 1) differs from the
# reference tag. zip truncates to the shorter file, so a length mismatch no
# longer raises IndexError as the original's range(n)/indexing did.
nb_error = sum(1 for base, ref in zip(baseline_reforme, reference_reforme)
               if base[1] != ref[1])
# Guard against empty files to avoid ZeroDivisionError.
print(nb_error / n if n else 0.0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement