Guest User

Untitled

a guest
Jan 17th, 2019
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.32 KB | None | 0 0
  1. import mysql.connector
  2. import nltk
  3. import sys
  4. import re
  5. nltk.data.path.append("/home/wilder/environments/test_env/lib/python3.6/site-packages/nltk_data")
  6. from nltk.corpus import stopwords
  7. from nltk.tokenize import word_tokenize
  8. from postaglist import postagliste_en
  9. ##LEMATIZER######################
  10. from nltk.stem import WordNetLemmatizer
  11. ##
  12. ##STEMMER######################
  13. from nltk.stem.porter import PorterStemmer
  14. ##
  15.  
  16. idretrieved = sys.argv[1]
  17.  
  18. JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64"
  19.  
  20. home = '/home/wilder/environments/test_env/lib/python3.6/site-packages'
  21.  
  22. from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
  23. _path_to_model = home + '/stanford-postagger-full-2018-10-16/models/english-bidirectional-distsim.tagger'
  24. _path_to_jar = home + '/stanford-postagger-full-2018-10-16/stanford-postagger-3.9.2.jar'
  25. st = POS_Tag(_path_to_model, _path_to_jar)
  26.  
  27. def main():
  28. #connexion to the API
  29. conn=mysql.connector.connect(
  30. host='92.175.11.66',
  31. port=4100,
  32. user='winespace',
  33. password='winespace',
  34. database='winespace')
  35. cursor=conn.cursor()
  36.  
  37. #retrieving the aromas through API
  38. sql="SELECT name, category FROM aroma_eng"
  39. cursor.execute(sql)
  40. results=cursor.fetchall()
  41. conn.close()
  42. cursor.close()
  43.  
  44. #tokenize the data's (give aroma with category)
  45. token=nltk.word_tokenize(" ".join(str(x) for x in list(results)))
  46. #tokenize the data's (give aroma with category used to retrieve the category)
  47. tokens10=[str(x).lower() for x in list(results)]
  48.  
  49. #tokenize the data's (give aroma without category)
  50. tokens=nltk.word_tokenize(" ".join(str(x[0]).lower() for x in list(results)))
  51.  
  52. #connexion to the API
  53. conn=mysql.connector.connect(
  54. host='92.175.11.66',
  55. port=4100,
  56. user='winespace',
  57. password='winespace',
  58. database='winespace')
  59.  
  60. cursor=conn.cursor()
  61.  
  62. #retrieving the comment through API
  63. sql="SELECT comment_wine FROM wine WHERE id="+idretrieved
  64. cursor.execute(sql)
  65. resultats=cursor.fetchall()
  66. conn.close()
  67. cursor.close()
  68.  
  69. #tokenize the data's comment
  70. tokens2=nltk.word_tokenize(" ".join(str(x) for x in list(resultats)))
  71.  
  72. # list of useless words in NPL
  73. stop_words_en=list(set(stopwords.words('english')))
  74.  
  75. #remove useless words from the comment we analyze
  76. resultat=[]
  77. for word in tokens2:
  78. if (word not in stop_words_en) and ( word != "(" and word !=")" and word !="." and word !="," and word !="'") :
  79. resultat.append(word)
  80.  
  81. ##LEMATIZER######################
  82. lemmatizer = WordNetLemmatizer()
  83. ##STEMMER######################
  84. stemmer = PorterStemmer()
  85. #for the aromas
  86. tokensNotokenize=",".join(str(x[0]).lower() for x in list(results))
  87. tokensNotokenize=tokensNotokenize.split(',')
  88.  
  89. #Build Lematize for aromas#####################
  90. resultat_lemmatized_aromas=[]
  91. for i in tokensNotokenize:
  92. if (' ' in i):
  93. i=i.split(' ')
  94. submod=[]
  95. for j in i:
  96. submod.append(lemmatizer.lemmatize(j))
  97. resultat_lemmatized_aromas.append(' '.join(submod))
  98. else:
  99. resultat_lemmatized_aromas.append(lemmatizer.lemmatize(i))
  100.  
  101. #Build stemmatize for aromas#####################
  102. resultat_stemmed_aromas=[]
  103. for i in tokensNotokenize:
  104. if (' ' in i):
  105. i=i.split(' ')
  106. submod=[]
  107. for j in i:
  108. submod.append(stemmer.stem(j))
  109. resultat_stemmed_aromas.append(' '.join(submod))
  110. else:
  111. resultat_stemmed_aromas.append(stemmer.stem(i))
  112.  
  113. #Build lemmatized for comment#####################
  114. resultat_lemmatized_comment=[lemmatizer.lemmatize(t) for t in resultat]
  115.  
  116. #Build stemmatized for comment#####################
  117. resultat_stemmed_comment = [stemmer.stem(word) for word in resultat]
  118.  
  119. #Matched aromas to store from comments, listedb and pertinence
  120. transmetter100=[]
  121. transmetter99=[]
  122. transmetter=[]
  123. for i in range(len(resultat_stemmed_comment)):
  124. for j in range(len(resultat_stemmed_aromas)):
  125. if resultat_stemmed_comment[i] in resultat_stemmed_aromas[j]:
  126. if (len (resultat_stemmed_comment[i]) == len (resultat_stemmed_aromas[j])):
  127. analyze100= {"matchedaromacomment":"","matchedaromaliste":"","matchedaromacategory":""}
  128. analyze100["matchedaromacomment"]= resultat_stemmed_comment[i]
  129. analyze100["matchedaromaliste"]= resultat_stemmed_aromas[j]
  130. analyze100["matchedaromacategory"]= (tokens10[j].split(','))[1].replace(")","").replace("'","").strip()
  131. transmetter100.append(analyze100)
  132.  
  133. else:
  134. analyze99= {"matchedaromacomment":"","matchedaromaliste":"","matchedaromacategory":""}
  135. analyze99["matchedaromacomment"]= resultat_stemmed_comment[i]
  136. analyze99["matchedaromaliste"]= resultat_stemmed_aromas[j]
  137. analyze99["matchedaromacategory"]= (tokens10[j].split(','))[1].replace(")","").replace("'","").strip()
  138. transmetter99.append(analyze99)
  139. transmetter.append(transmetter99)
  140. transmetter.append(transmetter100)
  141. print(transmetter)
  142.  
  143.  
  144. if __name__ == "__main__":
  145. main()
Add Comment
Please, Sign In to add comment