Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def binary_labels(df, min_set=None):
    """Build a balanced binary-sentiment dataset from rated reviews.

    Reviews rated 1.0 or 2.0 are labelled 0 (negative) and reviews rated
    4.0 or 5.0 are labelled 1 (positive). Rows with any other rating are
    left out of the result.

    Args:
        df: DataFrame with a ``review_text`` string column and a ``rating``
            column.
        min_set: Upper bound on the total number of rows returned; each of
            the four ratings contributes at most ``min_set // 4`` rows
            (the first ones in ``df`` order). When ``None``, it defaults to
            four times the size of the rarest rating present after
            filtering.

    Returns:
        A new DataFrame, sorted by index, holding the selected rows plus an
        integer ``label`` column. The input ``df`` is not modified.
    """
    # Remove outliers: keep only reviews whose length lies strictly between
    # 3 and 4000 characters. Both bounds must hold, hence `&`; with `|` the
    # condition is true for every non-missing length and nothing is removed.
    count = df['review_text'].str.len()
    df = df[(count > 3) & (count < 4000)].copy()

    # Equalize dataset sample size
    if min_set is None:
        rating_counts = df['rating'].value_counts()
        # With no rows left there are no counts; use 0 instead of NaN so the
        # slices below stay valid.
        min_set = 0 if rating_counts.empty else int(rating_counts.min()) * 4
    per_rating = min_set // 4

    def take(rating):
        # First `per_rating` rows carrying the given rating.
        return df[df['rating'] == rating].iloc[:per_rating]

    # Split rating 1.0, 2.0 as negative reviews
    negative_reviews = pd.concat([take(1.0), take(2.0)])
    negative_reviews['label'] = 0

    # Split rating 4.0, 5.0 as positive reviews
    positive_reviews = pd.concat([take(4.0), take(5.0)])
    positive_reviews['label'] = 1

    dataset = pd.concat([negative_reviews, positive_reviews])
    # Restore the original row order after the per-rating concatenation.
    dataset.sort_index(inplace=True)
    return dataset
# Build the labelled dataset, capped at 100,000 rows (25,000 per rating).
dataset = binary_labels(df, min_set=100_000)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement