import pandas as pd
import json
import os
# text pre-processing modules
import re
from autocorrect import spell
from nltk.corpus import stopwords, words
from nltk.corpus import wordnet as wn
import nltk
stop_words = set(stopwords.words('english'))
import math
from newspaper import Article
# Target record shape:
# {"category": "CRIME", "headline": text, "authors": "Melissa Jeltsen", "link": address, "short_description": text, "date": "2018-05-26"}
import numpy as np  # linear algebra
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, Concatenate, Lambda, Average
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers.merge import add
from keras import layers
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
max_features = 2000
embed_dim = 128
lstm_out = 196
batch_size = 32
# One shared tokenizer; NOTE: it must use the same vocabulary size the models were
# trained with (the original num_words=300 did not match max_features below).
tokenizer = Tokenizer(num_words=max_features, split=' ')
headlineModel = 'headlinesClassifierWithNormalizedData v.1.h5'
textModel = 'textModel.h5'
patterns = [
    r'<[^>]+>',                                # HTML tags
    r'(?:@[\w_]+)',                            # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",          # hash-tags
    r'http[s]?://(?:[\w+]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'\d+',                                    # numbers (not sure whether we should delete them at this stage)
    r"[']\w+",                                 # delete everything after an apostrophe (e.g. cat's => cat)
    r"[:;=%x][o0\-^_]?[ds\\\[\]\(\)/i|><]+",   # emoticons
]
def ClearFromPatterns(text, patterns):
    # Strip every regex pattern from the text in turn
    # (parameter renamed from `str`, which shadowed the builtin).
    result = text
    for pattern in patterns:
        result = re.sub(pattern, '', result)
    return result
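# Usage sketch (illustrative input, not from the original data):
#   ClearFromPatterns('<b>Hi</b> @user check https://example.com', patterns)
#   -> 'Hi  check '  (tags, mentions and URLs removed; spacing is normalized later)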
def Split(text):
    # \w+ also captures the final word, which the original r'(\w*) ' pattern
    # dropped whenever the text did not end with a space.
    return re.findall(r'\w+', text)
def DeletePunctuation(text):
    return ' '.join(Split(text))
def DeleteStopWords(words, stopWords):
    return [word for word in words if word not in stopWords]
def CorrectSpelling(words):
    text = [spell(word).lower() if len(word) > 3 else word for word in words]
    return text
def MorphyCorrection(words):
    res = []
    for word in words:
        newWord = wn.morphy(word)  # returns None if it can't normalize the word
        if newWord:
            res.append(newWord)
        else:
            res.append(word)
    return res
def GrammarPreProcessing(text):
    text = text.lower()
    text = ClearFromPatterns(text, patterns)
    text = DeletePunctuation(text)
    words = text.split(' ')
    words = DeleteStopWords(words, stop_words)
    # Disabled because we are working with news data, where spelling is usually correct:
    # words = CorrectSpelling(words)
    # words = MorphyCorrection(words)
    return ' '.join(words)
def TextsGrammarPreProcessing(texts):
    return [GrammarPreProcessing(text) for text in texts]
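# Rough example (exact output depends on the NLTK stop-word list):
#   GrammarPreProcessing('Police said the 3 men were arrested: http://example.com/news')
#   -> 'police said men arrested'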
def GetTexts(links):
    total = len(links)
    texts = []
    for i in range(total):
        texts.append(GetText(links[i]))
        print(i + 1, 'of', total, 'links processed (', round(float(i + 1) / total * 100, 2), '%)')
    return texts
def GetText(link):
    try:
        article = Article(link)
        article.download()
        article.parse()
        return article.text
    except Exception:
        # np.nan instead of pd.NaT: the column holds text, not timestamps,
        # and dropna() treats both the same way.
        return np.nan
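# Usage sketch (hypothetical URL): GetText('https://example.com/some-article') returns
# the article body, or NaN when downloading/parsing fails, so callers can dropna() after.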
def CreateModel(X, Y, path):
    # NOTE: this tokenizer is fitted only here; to get matching word indices at
    # prediction time it should be persisted and reused (see PrepareSet below).
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=300)
    model = Sequential()
    model.add(layers.Embedding(max_features, embed_dim, input_length=300))
    model.add(layers.Dense(10, activation='relu'))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
    model.add(layers.Dense(2, activation='softmax'))
    # Two-unit softmax over one-hot labels, so categorical_crossentropy is the
    # matching loss (binary_crossentropy would misreport accuracy here).
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    Y = pd.get_dummies(Y).values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    history = model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)
    scores = model.evaluate(X_test, Y_test)
    print(scores)
    model.save(path)
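# Example call (hypothetical output file name; X is raw text, Y the '0'/'1' labels):
#   CreateModel(learnData['ProcessedText'].values, learnData['labels'].values, 'textModel.h5')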
def PrepareSet(data, maxVectorLen=300):
    # CAUTION: refitting on the prediction data yields word indices that differ
    # from the ones seen in training; ideally the training tokenizer would be
    # saved alongside the model and loaded here instead.
    tokenizer.fit_on_texts(data)
    X = tokenizer.texts_to_sequences(data)
    X = pad_sequences(X, maxlen=maxVectorLen)
    return X
def DeleteShortWords(text):
    return ' '.join([word for word in text.split() if len(word) > 1])
def GetFreq(texts):
    freq = nltk.FreqDist()
    for text in texts:
        text = GrammarPreProcessing(text)
        print(text)
        tokens = nltk.word_tokenize(text)
        bigrams = list(nltk.bigrams(tokens))
        freq.update(bigrams)
    return freq
def AnalyzeBigrams(crimeTexts, notCrimeTexts):
    # (removed a stray print of `notCrime`, which is not defined in this scope)
    print('---------Crime---------')
    freq1 = GetFreq(crimeTexts)
    print(freq1.most_common(10))
    freq1.plot(20)
    print('---------Not crime---------')
    freq2 = GetFreq(notCrimeTexts)
    print(freq2.most_common(10))
    freq2.plot(20)
    return (freq1, freq2)
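# Usage sketch: compare the most frequent bigrams of the two classes, e.g.
#   crimeFreq, notCrimeFreq = AnalyzeBigrams(crime['headline'].values, notCrime['headline'].values)
# assuming crime/notCrime are DataFrames split on the 'category' column, as in
# LabelAndNormalizeData below.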
def LabelAndNormalizeData(df, dataRange):
    crime = df.loc[df.category == 'CRIME']
    notCrime = df.loc[df.category != 'CRIME']
    crime = crime.iloc[dataRange[0]:dataRange[1]]
    data = crime.append(notCrime.iloc[dataRange[0]:dataRange[1]])
    data['labels'] = ['1' if row['category'] == 'CRIME' else '0' for index, row in data.iterrows()]
    # Shuffle so the two classes are interleaved.
    data = data.iloc[np.random.permutation(len(data))]
    return data
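# Usage sketch: take equal-sized slices of CRIME and non-CRIME rows, label them
# '1'/'0' and shuffle, e.g. balanced = LabelAndNormalizeData(dataSet, [0, 1500]).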
def PrintStatistic(data, column):
    print(data.head(20))
    correct = 0
    totalCrime = 0
    correctCrime = 0
    falseCrime = 0
    falseNotCrime = 0
    total = len(data)
    for index, row in data.iterrows():
        if row['labels'] == '1':
            totalCrime += 1
            if row[column] == '1':
                correctCrime += 1
        if row['labels'] == row[column]:
            correct += 1
        elif row[column] == '1':
            falseCrime += 1
        else:
            falseNotCrime += 1
    print('Data:')
    print(data.category.value_counts())  # was `df`, an undefined global here
    print('total: ', correct / float(total))
    print('Crime found ratio: ', float(correctCrime) / totalCrime)
    print('falseNotCrime: ', falseNotCrime / float(total - correct))
    print('falseCrime: ', falseCrime / float(total - correct))
def MakePrediction(modelPath, data):
    model = load_model(modelPath)  # was hard-coded to headlineModel, ignoring the argument
    data = PrepareSet(data)
    # Softmax pair -> '0'/'1' label, i.e. pick the more probable class.
    return ['0' if pr[0] > pr[1] else '1' for pr in model.predict(data)]
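# The comprehension above is equivalent to:
#   np.argmax(model.predict(data), axis=1).astype(str)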
# dataSet = pd.read_json('news.json', lines=True)
# df = pd.read_json('ProcessedDataSet.json')
# df['link'] = dataSet['link']
# df = df.iloc[0:60]
# CreateModel(LabelAndNormalizeData(df, [0, 1500]), 'headlinesClassifierWithNormalizedData v.1.h5')
# df['predicted'] = MakePrediction(headlineModel, df['text'].values)
# print(df.head())
texts = pd.read_json('DataForTextProcessing.json')
texts = texts.astype({"labels": str})
# learnData = LabelAndNormalizeData(dataSet, [0, 1000])
# learnData['text'] = GetTexts(learnData['link'].values)
# learnData = learnData.dropna()
# learnData['ProcessedText'] = TextsGrammarPreProcessing(learnData['text'].values)
# learnData.to_json('DataForTextProcessing.json')
# CreateModel(learnData['ProcessedText'].values, learnData['labels'].values, 'textModel v.2.h5')
# df['analysis'] = MakePrediction('textModel v.2.h5', GetTexts(df['link'].values))
# df = df[['labels', 'predicted', 'analysis']]
print('columns:')
print(texts.columns.tolist())
print(texts.dtypes)
result = pd.DataFrame()
result['label'] = texts['labels']
result['headlinePr'] = MakePrediction(headlineModel, texts['headline'].values)
result['textPr'] = MakePrediction(textModel, texts['text'].values)
print(result.head(5))
totalCrime = 0
correctHeadTotal = 0
correctHeadCrime = 0
textAndHeadCorrect = 0
onlyHeadCorrect = 0
textCorrect = 0
correctCrimeText = 0
correctCrimeHead = 0
for index, row in result.iterrows():
    if row['label'] == '1':
        totalCrime += 1
        if row['label'] == row['textPr']:
            correctCrimeText += 1
        if row['label'] == row['headlinePr']:
            correctCrimeHead += 1
    if row['label'] == row['textPr']:
        textCorrect += 1
    if row['label'] == row['headlinePr']:
        correctHeadTotal += 1
        if row['label'] == '1':
            correctHeadCrime += 1
        if row['label'] == row['textPr']:
            textAndHeadCorrect += 1
        else:
            onlyHeadCorrect += 1
print('correctHeadTotal:', float(correctHeadTotal) / len(result))
print('textCorrectTotal:', float(textCorrect) / len(result))
print('textAndHeadCorrect:', float(textAndHeadCorrect) / len(result))
print('onlyHeadCorrect:', float(onlyHeadCorrect) / correctHeadTotal)
print()
print('correctCrimeText:', float(correctCrimeText) / totalCrime)
print('correctCrimeHead:', float(correctCrimeHead) / totalCrime)