Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Build a reduced, shuffled subset of `data` and split it into train/test sets.
# Relies on `data`, `pd` and `train_test_split` already being in scope.

# One seed shared by every random step (sampling, shuffling, splitting) so that
# re-running the script yields the same subset and the same split.
RANDOM_STATE = 0

# Drop columns we don't need
data = data.drop(columns=['date', 'tweet_size', 'mention'])

# Subset data set for faster training: keep 5% of the rows whose sentiment is 1
# and 5% of the rows whose sentiment is 0, sampled independently per class.
positive = data[data['sentiment'] == 1].sample(frac=0.05,
                                               random_state=RANDOM_STATE)
negative = data[data['sentiment'] == 0].sample(frac=0.05,
                                               random_state=RANDOM_STATE)

# Merge both samples and shuffle the rows. sample(frac=1) returns every row in
# random order and, unlike reindexing on a permuted index, does not raise when
# index labels are duplicated.
reduced_set = pd.concat([positive, negative]).sample(frac=1,
                                                     random_state=RANDOM_STATE)

# Preview of the first rows; presumably meant for a notebook cell, since the
# result is not assigned or printed.
reduced_set.head(5)

# Split into train and test: features are every column except 'sentiment',
# the target is the 'sentiment' column; 33% of the rows go to the test set.
X, y = reduced_set.drop(columns=['sentiment']), reduced_set['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=RANDOM_STATE)
Add Comment
Please, Sign In to add comment