Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## Build a "dictionary" mapping each word to its most frequently used POS tag.
# Each input line of atraiter.txt is expected as "<count> <word> <POS>", with
# all lines for the same word consecutive (grouped by word) —
# NOTE(review): assumption inferred from the grouping logic; confirm the file format.
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/atraiter.txt") as f:
    content = [line.strip() for line in f]
content_reforme = [line.split() for line in content]

table_of_words = []  # unique words; index matches the index of their POS below
table_of_POS = []    # most frequently used POS for the word at the same index

# "with" guarantees the output file is closed even on error. Mode "a+" is kept
# from the original (re-running the script appends duplicate lines —
# NOTE(review): "w" is probably what was intended; confirm before changing).
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/dictionnaire.txt", "a+") as output:
    n = len(content_reforme)
    if n:  # guard: the original crashed on an empty input file
        max_current = int(content_reforme[0][0])  # best count seen in the current group
        num_max = 0                               # row index of that best entry
        for i in range(n - 1):
            if content_reforme[i][1] == content_reforme[i + 1][1]:
                # Same word: remember the entry with the highest count.
                if int(content_reforme[i + 1][0]) > max_current:
                    max_current = int(content_reforme[i + 1][0])
                    num_max = i + 1
            else:
                # Word changed: flush the best entry of the finished group.
                output.write(content_reforme[num_max][1] + ' ' + content_reforme[num_max][2] + '\n')
                table_of_words.append(content_reforme[num_max][1])
                table_of_POS.append(content_reforme[num_max][2])
                max_current = int(content_reforme[i + 1][0])
                num_max = i + 1
        # Flush the final group using its BEST entry. Bug fix: the original
        # wrote content_reforme[i+1] (the last line of the file) instead of
        # content_reforme[num_max], so when the last group's most frequent
        # entry was not its last line, the wrong POS was recorded.
        output.write(content_reforme[num_max][1] + ' ' + content_reforme[num_max][2] + '\n')
        table_of_words.append(content_reforme[num_max][1])
        table_of_POS.append(content_reforme[num_max][2])
## Creation of POS tags using the baseline method: for every word in
## wordsToFindPOS.txt, look up its most frequent POS and write
## "<word> <POS>" to POS_baseline.txt.
# Build the lookup table once: a dict lookup is O(1) versus list.index's O(n),
# turning the tagging loop from O(n*m) into O(n+m).
pos_of_word = dict(zip(table_of_words, table_of_POS))
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/wordsToFindPOS.txt") as f:
    content = [line.strip() for line in f]
# "with" replaces the original's manual close (which was called twice) and
# closes the file even if a lookup fails. Mode "a+" kept from the original.
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/POS_baseline.txt", "a+") as output:
    for k in content:
        # KeyError here means the word never appeared in the training data
        # (the original raised ValueError from list.index in the same case).
        output.write(k + ' ' + pos_of_word[k] + '\n')
## Compare the baseline POS tags to the reference POS tags and print the
## error rate (fraction of words whose predicted tag differs).
# NOTE(review): the original bound POS_baseline.txt to "originalPOS" and
# POS_trainSet.txt to "baselinePOS" — the names were swapped; renamed here
# so the code reads correctly (the comparison itself is unchanged).
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/POS_baseline.txt") as f:
    baseline_reforme = [line.split() for line in f]  # rows of [word, POS]
with open("/home/felix/Documents/TokyoTech/NaturalLanguageProcessing/Assignment3/POS_trainSet.txt") as f:
    reference_reforme = [line.split() for line in f]
n = len(reference_reforme)
# Count positions where the predicted tag (column 1) differs from the
# reference tag. zip truncates to the shorter file, so a length mismatch no
# longer raises IndexError as the original's range(n)/indexing did.
nb_error = sum(1 for base, ref in zip(baseline_reforme, reference_reforme)
               if base[1] != ref[1])
# Guard against empty files to avoid ZeroDivisionError.
print(nb_error / n if n else 0.0)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement