import pandas as pd
import json
import os
# text pre-processing modules
import re
from autocorrect import spell
from nltk.corpus import stopwords, words
from nltk.corpus import wordnet as wn
import nltk
stop_words = set(stopwords.words('english'))
import math
from newspaper import Article
# Target record shape:
# {"category": "CRIME", "headline": text, "authors": "Melissa Jeltsen", "link": address, "short_description": text, "date": "2018-05-26"}
import numpy as np  # linear algebra
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, Concatenate, Lambda, Average
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers.merge import add
from keras import layers
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
max_features = 2000
embed_dim = 128
lstm_out = 196
batch_size = 32
# One shared tokenizer; NOTE: it must use the same vocabulary size the models were
# trained with (the original num_words=300 did not match max_features below).
tokenizer = Tokenizer(num_words=max_features, split=' ')
headlineModel = 'headlinesClassifierWithNormalizedData v.1.h5'
textModel = 'textModel.h5'
patterns = [
    r'<[^>]+>',                                # HTML tags
    r'(?:@[\w_]+)',                            # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",          # hash-tags
    r'http[s]?://(?:[\w+]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'\d+',                                    # numbers (not sure whether we should delete them at this stage)
    r"[']\w+",                                 # delete everything after an apostrophe (e.g. cat's => cat)
    r"[:;=%x][o0\-^_]?[ds\\\[\]\(\)/i|><]+",   # emoticons
]
def ClearFromPatterns(text, patterns):
    # Strip every regex pattern from the text in turn
    # (parameter renamed from `str`, which shadowed the builtin).
    result = text
    for pattern in patterns:
        result = re.sub(pattern, '', result)
    return result
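# Usage sketch (illustrative input, not from the original data):
#   ClearFromPatterns('<b>Hi</b> @user check https://example.com', patterns)
#   -> 'Hi  check '  (tags, mentions and URLs removed; spacing is normalized later)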
def Split(text):
    # \w+ also captures the final word, which the original r'(\w*) ' pattern
    # dropped whenever the text did not end with a space.
    return re.findall(r'\w+', text)
def DeletePunctuation(text):
    return ' '.join(Split(text))
def DeleteStopWords(words, stopWords):
    return [word for word in words if word not in stopWords]
def CorrectSpelling(words):
    text = [spell(word).lower() if len(word) > 3 else word for word in words]
    return text
def MorphyCorrection(words):
    res = []
    for word in words:
        newWord = wn.morphy(word)  # returns None if it can't normalize the word
        if newWord:
            res.append(newWord)
        else:
            res.append(word)
    return res
def GrammarPreProcessing(text):
    text = text.lower()
    text = ClearFromPatterns(text, patterns)
    text = DeletePunctuation(text)
    words = text.split(' ')
    words = DeleteStopWords(words, stop_words)
    # Disabled because we are working with news data, where spelling is usually correct:
    # words = CorrectSpelling(words)
    # words = MorphyCorrection(words)
    return ' '.join(words)
def TextsGrammarPreProcessing(texts):
    return [GrammarPreProcessing(text) for text in texts]
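# Rough example (exact output depends on the NLTK stop-word list):
#   GrammarPreProcessing('Police said the 3 men were arrested: http://example.com/news')
#   -> 'police said men arrested'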
def GetTexts(links):
    total = len(links)
    texts = []
    for i in range(total):
        texts.append(GetText(links[i]))
        print(i + 1, 'of', total, 'links processed (', round(float(i + 1) / total * 100, 2), '%)')
    return texts
def GetText(link):
    try:
        article = Article(link)
        article.download()
        article.parse()
        return article.text
    except Exception:
        # np.nan instead of pd.NaT: the column holds text, not timestamps,
        # and dropna() treats both the same way.
        return np.nan
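# Usage sketch (hypothetical URL): GetText('https://example.com/some-article') returns
# the article body, or NaN when downloading/parsing fails, so callers can dropna() after.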
def CreateModel(X, Y, path):
    # NOTE: this tokenizer is fitted only here; to get matching word indices at
    # prediction time it should be persisted and reused (see PrepareSet below).
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=300)
    model = Sequential()
    model.add(layers.Embedding(max_features, embed_dim, input_length=300))
    model.add(layers.Dense(10, activation='relu'))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
    model.add(layers.Dense(2, activation='softmax'))
    # Two-unit softmax over one-hot labels, so categorical_crossentropy is the
    # matching loss (binary_crossentropy would misreport accuracy here).
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    Y = pd.get_dummies(Y).values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    history = model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2)
    scores = model.evaluate(X_test, Y_test)
    print(scores)
    model.save(path)
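# Example call (hypothetical output file name; X is raw text, Y the '0'/'1' labels):
#   CreateModel(learnData['ProcessedText'].values, learnData['labels'].values, 'textModel.h5')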
def PrepareSet(data, maxVectorLen=300):
    # CAUTION: refitting on the prediction data yields word indices that differ
    # from the ones seen in training; ideally the training tokenizer would be
    # saved alongside the model and loaded here instead.
    tokenizer.fit_on_texts(data)
    X = tokenizer.texts_to_sequences(data)
    X = pad_sequences(X, maxlen=maxVectorLen)
    return X
def DeleteShortWords(text):
    return ' '.join([word for word in text.split() if len(word) > 1])
def GetFreq(texts):
    freq = nltk.FreqDist()
    for text in texts:
        text = GrammarPreProcessing(text)
        print(text)
        tokens = nltk.word_tokenize(text)
        bigrams = list(nltk.bigrams(tokens))
        freq.update(bigrams)
    return freq
def AnalyzeBigrams(crimeTexts, notCrimeTexts):
    # (removed a stray print of `notCrime`, which is not defined in this scope)
    print('---------Crime---------')
    freq1 = GetFreq(crimeTexts)
    print(freq1.most_common(10))
    freq1.plot(20)
    print('---------Not crime---------')
    freq2 = GetFreq(notCrimeTexts)
    print(freq2.most_common(10))
    freq2.plot(20)
    return (freq1, freq2)
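# Usage sketch: compare the most frequent bigrams of the two classes, e.g.
#   crimeFreq, notCrimeFreq = AnalyzeBigrams(crime['headline'].values, notCrime['headline'].values)
# assuming crime/notCrime are DataFrames split on the 'category' column, as in
# LabelAndNormalizeData below.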
def LabelAndNormalizeData(df, dataRange):
    crime = df.loc[df.category == 'CRIME']
    notCrime = df.loc[df.category != 'CRIME']
    crime = crime.iloc[dataRange[0]:dataRange[1]]
    data = crime.append(notCrime.iloc[dataRange[0]:dataRange[1]])
    data['labels'] = ['1' if row['category'] == 'CRIME' else '0' for index, row in data.iterrows()]
    # Shuffle so the two classes are interleaved.
    data = data.iloc[np.random.permutation(len(data))]
    return data
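# Usage sketch: take equal-sized slices of CRIME and non-CRIME rows, label them
# '1'/'0' and shuffle, e.g. balanced = LabelAndNormalizeData(dataSet, [0, 1500]).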
def PrintStatistic(data, column):
    print(data.head(20))
    correct = 0
    totalCrime = 0
    correctCrime = 0
    falseCrime = 0
    falseNotCrime = 0
    total = len(data)
    for index, row in data.iterrows():
        if row['labels'] == '1':
            totalCrime += 1
            if row[column] == '1':
                correctCrime += 1
        if row['labels'] == row[column]:
            correct += 1
        elif row[column] == '1':
            falseCrime += 1
        else:
            falseNotCrime += 1
    print('Data:')
    print(data.category.value_counts())  # was `df`, an undefined global here
    print('total: ', correct / float(total))
    print('Crime found ratio: ', float(correctCrime) / totalCrime)
    print('falseNotCrime: ', falseNotCrime / float(total - correct))
    print('falseCrime: ', falseCrime / float(total - correct))
def MakePrediction(modelPath, data):
    model = load_model(modelPath)  # was hard-coded to headlineModel, ignoring the argument
    data = PrepareSet(data)
    # Softmax pair -> '0'/'1' label, i.e. pick the more probable class.
    return ['0' if pr[0] > pr[1] else '1' for pr in model.predict(data)]
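# The comprehension above is equivalent to:
#   np.argmax(model.predict(data), axis=1).astype(str)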
# dataSet = pd.read_json('news.json', lines=True)
# df = pd.read_json('ProcessedDataSet.json')
# df['link'] = dataSet['link']
# df = df.iloc[0:60]
# CreateModel(LabelAndNormalizeData(df, [0, 1500]), 'headlinesClassifierWithNormalizedData v.1.h5')
# df['predicted'] = MakePrediction(headlineModel, df['text'].values)
# print(df.head())
texts = pd.read_json('DataForTextProcessing.json')
texts = texts.astype({"labels": str})
# learnData = LabelAndNormalizeData(dataSet, [0, 1000])
# learnData['text'] = GetTexts(learnData['link'].values)
# learnData = learnData.dropna()
# learnData['ProcessedText'] = TextsGrammarPreProcessing(learnData['text'].values)
# learnData.to_json('DataForTextProcessing.json')
# CreateModel(learnData['ProcessedText'].values, learnData['labels'].values, 'textModel v.2.h5')
# df['analysis'] = MakePrediction('textModel v.2.h5', GetTexts(df['link'].values))
# df = df[['labels', 'predicted', 'analysis']]
print('columns:')
print(texts.columns.tolist())
print(texts.dtypes)
result = pd.DataFrame()
result['label'] = texts['labels']
result['headlinePr'] = MakePrediction(headlineModel, texts['headline'].values)
result['textPr'] = MakePrediction(textModel, texts['text'].values)
print(result.head(5))
totalCrime = 0
correctHeadTotal = 0
correctHeadCrime = 0
textAndHeadCorrect = 0
onlyHeadCorrect = 0
textCorrect = 0
correctCrimeText = 0
correctCrimeHead = 0
for index, row in result.iterrows():
    if row['label'] == '1':
        totalCrime += 1
        if row['label'] == row['textPr']:
            correctCrimeText += 1
        if row['label'] == row['headlinePr']:
            correctCrimeHead += 1
    if row['label'] == row['textPr']:
        textCorrect += 1
    if row['label'] == row['headlinePr']:
        correctHeadTotal += 1
        if row['label'] == '1':
            correctHeadCrime += 1
        if row['label'] == row['textPr']:
            textAndHeadCorrect += 1
        else:
            onlyHeadCorrect += 1
print('correctHeadTotal:', float(correctHeadTotal) / len(result))
print('textCorrectTotal:', float(textCorrect) / len(result))
print('textAndHeadCorrect:', float(textAndHeadCorrect) / len(result))
print('onlyHeadCorrect:', float(onlyHeadCorrect) / correctHeadTotal)
print()
print('correctCrimeText:', float(correctCrimeText) / totalCrime)
print('correctCrimeHead:', float(correctCrimeHead) / totalCrime)