Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Train a TF-IDF + logistic-regression sentiment classifier on lemmatized
# tweets and write the predicted labels for the test set to disk.
#
# Inputs : tweets_lemm_train.csv (columns 'lemm_text', 'positive')
#          tweets_lemm_test.csv  (column 'lemm_text')
# Output : a CSV file named 'predictions' with a single 'positive' column.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.linear_model import LogisticRegression

# The datasets live either at the filesystem root or under /datasets,
# depending on the environment. Only a missing file triggers the fallback;
# any other failure (malformed CSV, permissions, Ctrl-C) is surfaced instead
# of being masked by a bare ``except``.
try:
    train = pd.read_csv('/tweets_lemm_train.csv')
    test = pd.read_csv('/tweets_lemm_test.csv')
except FileNotFoundError:
    train = pd.read_csv('/datasets/tweets_lemm_train.csv')
    test = pd.read_csv('/datasets/tweets_lemm_test.csv')

# Cast the text column to NumPy unicode so the vectorizer only ever receives
# strings, even where pandas parsed an empty cell as a non-string value.
X_train = train['lemm_text'].values.astype('U')
y_train = train['positive']
X_test = test['lemm_text'].values.astype('U')

# Fetch the stop-word corpus if it is not already cached locally.
nltk.download('stopwords')

# scikit-learn validates ``stop_words`` and expects a list (not a set), so
# the list returned by NLTK is passed through as-is.
stopwords = list(nltk_stopwords.words('russian'))
count_tf_idf = TfidfVectorizer(stop_words=stopwords)

# Learn the vocabulary/IDF weights and vectorize the training texts in a
# single pass; the test texts are projected with the already-fitted vectorizer.
train_tfidf = count_tf_idf.fit_transform(X_train)
test_tfidf = count_tf_idf.transform(X_test)

model = LogisticRegression()
model.fit(train_tfidf, y_train)

# One predicted label per test row, saved under the 'positive' column.
predictions = pd.DataFrame(model.predict(test_tfidf), columns=['positive'])
predictions.to_csv('predictions')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement