import pandas as pd  # data processing, CSV/JSON file I/O (e.g. pd.read_csv)
import numpy as np   # linear algebra
import json
import os
import math
import re

# text pre-processing modules
from autocorrect import spell
from nltk.corpus import stopwords, words
from nltk.corpus import wordnet as wn
import nltk
stop_words = set(stopwords.words('english'))

from newspaper import Article
# Example dataset record:
# {"category": "CRIME", "headline": text, "authors": "Melissa Jeltsen", "link": adres, "short_description": text, "date": "2018-05-26"}

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from keras import backend as K
from keras import initializers, regularizers, constraints
from keras import layers
from keras.engine.topology import Layer
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, Concatenate, Lambda, Average
from keras.layers.merge import add
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.utils import np_utils

tokenizer = Tokenizer(num_words=300, split=' ')  # global tokenizer used at prediction time
max_features = 2000   # vocabulary size for the training tokenizer
embed_dim = 128       # embedding dimension
lstm_out = 196        # LSTM units
batch_size = 32

headlineModel = 'headlinesClissiferWithNormalizedData v.1.h5'
textModel = 'model.h5'

patterns = [
    r'<[^>]+>',                                   # HTML tags
    r'(?:@[\w_]+)',                               # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",             # hash-tags
    r'http[s]?://(?:[\w+]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'\d+',                                       # numbers (not sure whether these should be dropped at this stage)
    r"[']\w+",                                    # everything after an apostrophe (e.g. cat's => cat)
    r"[:;=%x][o0\-^_]?[ds\\\[\]\(\)/i|><]+",      # emoticons
]

def ClearFromPatterns(text, patterns):
    # strip every regex pattern from the text
    result = text
    for pattern in patterns:
        result = re.sub(pattern, '', result)
    return result

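# Illustrative check of the pattern stripping (made-up input, not from the original run):
# ClearFromPatterns("Officer @john_doe posted <b>news</b> at http://example.com #crime", patterns)
# would return roughly "Officer  posted news at  " (mentions, tags, URLs and hashtags removed)
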
def Split(text):
    # split the text into word tokens (runs of word characters)
    regex = r'\w+'
    result = re.findall(regex, text)
    return result

def DeletePunctuation(text):
    return ' '.join([word for word in Split(text) if len(word) > 0])

def DeleteStopWords(words, stopWords):
    return [word for word in words if word not in stopWords]

def CorrectSpelling(words):
    text = [spell(word).lower() if len(word) > 3 else word for word in words]
    return text

def MorphyCorrection(words):
    res = []
    for word in words:
        newWord = wn.morphy(word)  # returns None if it cannot normalize the word
        if newWord:
            res.append(newWord)
        else:
            res.append(word)
    return res

def GrammarPreProcessing(text):
    text = text.lower()
    text = ClearFromPatterns(text, patterns)
    text = DeletePunctuation(text)
    words = text.split(' ')
    words = DeleteStopWords(words, stop_words)
    # Disabled because we are working with news data
    # words = CorrectSpelling(words)
    # words = MorphyCorrection(words)
    return ' '.join(words)

def TextsGrammarPreProcessing(texts):
    return [GrammarPreProcessing(text) for text in texts]

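# Illustrative run of the full cleaning pipeline (made-up headline, output is approximate):
# GrammarPreProcessing("Police arrest 3 suspects after robbery http://example.com #crime")
# would return something like "police arrest suspects robbery"
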
def GetTexts(links):
    total = len(links)
    texts = []
    for i in range(0, total):
        texts.append(GetText(links[i]))
        print(i + 1, 'of ', total, ' links processed(', round(float(i + 1) / total, 4) * 100, '%)')
    return texts

def GetText(link):
    # download and parse the article body; return NaT so failed rows can later be dropped with dropna()
    try:
        article = Article(link)
        article.download()
        article.parse()
        return article.text
    except Exception:
        return pd.NaT

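# Illustrative usage (hypothetical URL, requires network access):
# body = GetText('https://example.com/some-news-story')
# bodies = GetTexts(learnData['link'].values)
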
def CreateModel(X, Y, path):
    # fit a fresh tokenizer on the training texts and turn them into padded index sequences
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(X)
    # data['text'] = [GrammarPreProcessing(row['text']) for index, row in data.iterrows()]
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=300)
    # embedding -> dense -> LSTM -> 2-way softmax over (not crime, crime)
    model = Sequential()
    model.add(layers.Embedding(max_features, embed_dim, input_length=300))
    model.add(layers.Dense(10, activation='relu'))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
    model.add(layers.Dense(2, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    Y = pd.get_dummies(Y).values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    batch_size = 32
    history = model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)
    scores = model.evaluate(X_test, Y_test)
    print(scores)
    model.save(path)

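# Illustrative training call (assumes a frame produced by LabelAndNormalizeData below,
# with 'headline' and 'labels' columns; the file name is the one defined above):
# sample = LabelAndNormalizeData(df, [0, 1500])
# CreateModel(sample['headline'].values, sample['labels'].values, headlineModel)
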
def PrepareSet(data, maxVectorLen=300):
    # NOTE: this re-fits the global tokenizer on the prediction data, so the word index
    # is not guaranteed to match the one used when the model was trained
    tokenizer.fit_on_texts(data)
    X = tokenizer.texts_to_sequences(data)
    X = pad_sequences(X, maxlen=maxVectorLen)
    return X

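# Quick shape check (illustrative, made-up texts):
# PrepareSet(['police arrest suspect', 'markets rally']).shape  # -> (2, 300)
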
def DeleteShortWords(text):
    return ' '.join([word for word in text.split() if len(word) > 1])

def GetFreq(texts):
    freq = nltk.FreqDist()
    for text in texts:
        text = GrammarPreProcessing(text)
        print(text)
        tokens = nltk.word_tokenize(text)
        bigrams = list(nltk.bigrams(tokens))
        freq.update(bigrams)
    return freq

def AnalyzeBigrams(crimeTexts, notCrimeTexts):
    print('---------Crime---------')
    freq1 = GetFreq(crimeTexts)
    print(freq1.most_common(10))
    freq1.plot(20)
    print('---------Not crime---------')
    freq2 = GetFreq(notCrimeTexts)
    print(freq2.most_common(10))
    freq2.plot(20)
    return (freq1, freq2)

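# Illustrative comparison of bigram frequencies in the two classes (assumes a labelled frame
# with 'category' and 'headline' columns, as in the dataset described above):
# crimeFreq, otherFreq = AnalyzeBigrams(df.loc[df.category == 'CRIME']['headline'].values,
#                                       df.loc[df.category != 'CRIME']['headline'].values)
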
def LabelAndNormalizeData(df, dataRange):
    # take equally sized slices of crime and non-crime rows, label them '1'/'0' and shuffle
    crime = df.loc[df.category == 'CRIME']
    notCrime = df.loc[df.category != 'CRIME']
    crime = crime.iloc[dataRange[0]:dataRange[1]]
    data = crime.append(notCrime.iloc[dataRange[0]:dataRange[1]])
    data['labels'] = ['1' if row['category'] == 'CRIME' else '0' for index, row in data.iterrows()]
    data = data.iloc[np.random.permutation(len(data))]
    return data

def PrintStatistic(data, column):
    print(data.head(20))
    correct = 0
    totalCrime = 0
    correctCrime = 0
    falseCrime = 0
    falseNotCrime = 0
    total = len(data)
    for index, row in data.iterrows():
        if row['labels'] == '1':
            totalCrime += 1
            if row[column] == '1':
                correctCrime += 1
        if row['labels'] == row[column]:
            correct += 1
        elif row[column] == '1':
            falseCrime += 1
        else:
            falseNotCrime += 1
    print('Data:')
    print(data.category.value_counts())
    print('total: ', correct / float(total))
    print('Crime found ratio: ', float(correctCrime) / totalCrime)
    print('falseNotCrime: ', falseNotCrime / float(total - correct))
    print('falseCrime: ', falseCrime / float(total - correct))

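# Illustrative report (hypothetical frame with gold labels in 'labels' and model outputs in 'predicted'):
# PrintStatistic(labelledFrame, 'predicted')
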
def MakePrediction(modelPath, data):
    # load the requested model and map each softmax pair to a '0'/'1' label
    model = load_model(modelPath)
    data = PrepareSet(data)
    return ['0' if pr[0] > pr[1] else '1' for pr in model.predict(data)]

# dataSet = pd.read_json('news.json', lines=True)
# df = pd.read_json('ProcessedDataSet.json')
# df['link'] = dataSet['link']
# df = df.iloc[0:60]
# CreateModel(LabelAndNormalizeData(df, [0, 1500]), 'headlinesClissiferWithNormalizedData v.1.h5')
# df['predicted'] = MakePrediction(headlineModel, df['text'].values)
# print(df.head())
texts = pd.read_json('DataForTextProcessing.json')
texts = texts.astype({"labels": str})
# learnData = LabelAndNormalizeData(dataSet, [0, 1000])
# learnData['text'] = GetTexts(learnData['link'].values)
# learnData = learnData.dropna()
# learnData['ProcessedText'] = TextsGrammarPreProcessing(learnData['text'].values)
# learnData.to_json('DataForTextProcessing.json')
# CreateModel(learnData['ProcessedText'].values, learnData['labels'].values, 'textModel v.2.h5')
# df['analysis'] = MakePrediction('textModel v.2.h5', GetTexts(df['link'].values))
# df = df[['labels','predicted', 'analysis']]
print('columns:')
print(texts.columns.tolist())
print(texts.dtypes)
result = pd.DataFrame()
result['label'] = texts['labels']
result['headlinePr'] = MakePrediction(headlineModel, texts['headline'].values)
result['textPr'] = MakePrediction('textModel.h5', texts['text'].values)
print(result.head(5))

totalCrime = 0
correctHeadTotal = 0
correctHeadCrime = 0
textAndHeadCorrect = 0
onlyHeadCorrect = 0
textCorrect = 0
correctCrimeText = 0
correctCrimeHead = 0
for index, row in result.iterrows():
    if row['label'] == '1':
        totalCrime += 1
        if row['label'] == row['textPr']:
            correctCrimeText += 1
        if row['label'] == row['headlinePr']:
            correctCrimeHead += 1
    if row['label'] == row['textPr']:
        textCorrect += 1
    if row['label'] == row['headlinePr']:
        correctHeadTotal += 1
        if row['label'] == '1':
            correctHeadCrime += 1
        if row['label'] == row['textPr']:
            textAndHeadCorrect += 1
        else:
            onlyHeadCorrect += 1
print('correctHeadTotal:', float(correctHeadTotal) / len(result))
print('textCorrectTotal:', float(textCorrect) / len(result))
print('textAndHeadCorrect:', float(textAndHeadCorrect) / len(result))
print('onlyHeadCorrect:', float(onlyHeadCorrect) / correctHeadTotal)
print()
print('correctCrimeText:', float(correctCrimeText) / totalCrime)
print('correctHeadCrime:', float(correctCrimeHead) / totalCrime)