Advertisement
Guest User

Untitled

a guest
Jul 16th, 2019
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.94 KB | None | 0 0
  1. def binary_labels(df, min_set=None):
  2.  
  3. # Remove outliers
  4. count = df['review_text'].str.len()
  5. df = df[(count > 3) | (count < 4000)].copy()
  6.  
  7. # Equilize dataset sample size
  8. if min_set is None:
  9. min_set = df['rating'].value_counts().min() * 4
  10.  
  11. # Split rating 1.0, 2.0 as negative reviews
  12. rating_1 = df[df['rating'] == 1.0].iloc[:min_set // 4]
  13. rating_2 = df[df['rating'] == 2.0].iloc[:min_set // 4]
  14. negative_reviews = pd.concat([rating_1, rating_2])
  15. negative_reviews['label'] = 0
  16.  
  17. # Split rating 4.0, 5.0 as positive reviews
  18. rating_4 = df[df['rating'] == 4.0].iloc[:min_set // 4]
  19. rating_5 = df[df['rating'] == 5.0].iloc[:min_set // 4]
  20. positive_reviews = pd.concat([rating_4, rating_5])
  21. positive_reviews['label'] = 1
  22.  
  23. dataset = pd.concat([negative_reviews, positive_reviews])
  24. dataset.sort_index(inplace=True)
  25.  
  26. return dataset
  27.  
  28. dataset = binary_labels(df, min_set=100000)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement