Advertisement
tivaliy

Untitled

Feb 19th, 2018
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.66 KB | None | 0 0
  1. def validate_by_counts(tweet, frequent_keys, keywords, recipients, partials, rules):
  2.     """
  3.    validating function.
  4.    Look for counts of recipients, keywords, frequent keys and mark it as eligible if it passes one of
  5.    provided rules (eg TEST_VALIDATION_RULES). If all rules not passed, tweet marked as ineligible.
  6.    Currently, this is mostly used for our long-tail tweets job in which we find a bunch of text
  7.    from high-confidence tweets, web comments, and look at the most frequent words and phrases.
  8.    We then pull another set of tweets from twitter which might be on topic, and use our
  9.    various added frequent keywords from the high-confidence tweets/web comments (eligible ones
  10.    from our verified/unveried web search and comment scraping/fetching) to validate this new set of tweets.
  11.    """
  12.     def _get_counts(txt, keywords, recipients, partials, frequent_keys):
  13.         """
  14.        Function count recipients, keywords and frequent keys on text. Return dictionary with counts
  15.        """
  16.         # init counts dictionary
  17.         counts = {'keywords': [], 'keyphrases': [], 'recipients': [],
  18.                   'high_freq_words': [], 'medium_freq_words': [], 'low_freq_words': [],
  19.                   'high_freq_phrases': [], 'medium_freq_phrases': [], 'low_freq_phrases': [],}
  20.         # remove punctuation from text
  21.         txt = remove_punctuation(txt).lower()
  22.         if not txt:
  23.             return counts
  24.         # init keys dictionary for search. Here we use "counts" fields and will it with appropriate keys list
  25.         # later we will loop on that dictionary and use dictionary key as key for "counts" dictionary
  26.         # and value(appropriate keys list) for lookup on text
  27.         keys_lists = {'keywords': [kw for kw in keywords if ' ' not in kw and '-' not in kw],
  28.                       'keyphrases': [kw for kw in keywords if ' ' in kw or '-' in kw],
  29.                       'high_freq_words': frequent_keys.get('top_terms_words_high', []),
  30.                       'medium_freq_words': frequent_keys.get('top_terms_words_medium', []),
  31.                       'low_freq_words': frequent_keys.get('top_terms_words_low', []),
  32.                       'high_freq_phrases': frequent_keys.get('top_terms_phrases_high', []),
  33.                       'medium_freq_phrases': frequent_keys.get('top_terms_phrases_medium', []),
  34.                       'low_freq_phrases': frequent_keys.get('top_terms_phrases_low', []),
  35.                       }
  36.         for destination, keys in keys_lists.iteritems():
  37.             # if keywords provided as dictionaries need to retrieve words
  38.             if keys and isinstance(keys[0], dict):
  39.                 keys = [k['word'] for k in keys]
  40.             # sort keys by number of words, take copy to not change original list order
  41.             keys = sorted(keys[:], key=lambda x: len(x.split()), reverse=True)
  42.             # remove punctuation from keys as well
  43.             keys = [remove_punctuation(k).lower() for k in keys]
  44.  
  45.             for key in keys:
  46.                 if key in txt:
  47.                     # if we already have that phrase in previously found phrases - skip it
  48.                     for found in counts[destination]:
  49.                         if key in found:
  50.                             break
  51.                     else:
  52.                         # append only if phrase not contained in previously found phrases
  53.                         counts[destination].append(key)
  54.  
  55.         _log.debug("Before processing, counts = {0}".format(counts))
  56.         # Get all words from counts dictionary
  57.         all_words = [item for sublist in counts.values()
  58.                      for item in sublist]
  59.         _log.debug("all_words = {0}".format(all_words))
  60.         # Remove repeating words (substrings)
  61.         sorted_list = sorted(all_words, key=len, reverse=True)
  62.         sifted_words = []
  63.         for s in sorted_list:
  64.             if not any([s in o for o in sifted_words]):
  65.                 sifted_words.append(s)
  66.         _log.debug("sifted_words = {0}".format(sifted_words))
  67.         # Create new dictionary taking into account only sifted words
  68.         counts = {k: [i for i in v if i in sifted_words]
  69.                   for k, v in counts.iteritems()}
  70.         _log.debug("After processing, counts = {0}".format(counts))
  71.  
  72.         # count recipients
  73.         found_recipients = []
  74.         for r in recipients:
  75.             # collect all terms for lookup firstly
  76.             lookup_names = set()
  77.             partials_for_check = [part['name'].lower() for part in partials
  78.                                   if part['name'].lower() in r['name'].lower() and part['name'] not in stopwords]
  79.             lookup_names.update(partials_for_check)
  80.             # checking same values in found recipients
  81.             have_same_recipient = False
  82.             for found_r in found_recipients:
  83.                 if 'fb' in found_r and 'fb' in r and found_r['fb'].lower() == r['fb'].lower():
  84.                     have_same_recipient = True
  85.                     break
  86.                 if 'tw' in found_r and 'tw' in r and found_r['tw'].lower() == r['tw'].lower():
  87.                     have_same_recipient = True
  88.                     break
  89.                 found_redirects = set(found_r.get('redirect_names', []))
  90.                 r_redirects = set(r.get('redirect_names', []))
  91.                 if found_redirects & r_redirects:
  92.                     have_same_recipient = True
  93.                     break
  94.                 found_partials = [p['name'].lower() for p in partials
  95.                                   if p['name'].lower() in found_r['name'].lower() and p['name'] not in stopwords]
  96.                 if set(found_partials) & set(partials_for_check):
  97.                     have_same_recipient = True
  98.                     break
  99.             if have_same_recipient:
  100.                 continue
  101.             if r['name'] not in stopwords:
  102.                 lookup_names.add(r['name'].lower())
  103.  
  104.             if 'fb' in r:
  105.                 lookup_names.add(r['fb'].lower())
  106.             if 'tw' in r:
  107.                 lookup_names.add(r['tw'].lower())
  108.  
  109.             redirect_names = r.get('redirect_names', [])
  110.             lookup_names.update([rn.lower() for rn in redirect_names if rn not in stopwords])
  111.  
  112.             for name in lookup_names:
  113.                 if name in txt:
  114.                     for found in counts['recipients']:
  115.                         if name in found or found in name:
  116.                             break
  117.                     else:
  118.                         # if we don't found any matches in previously found names we append new entry
  119.                         counts['recipients'].append(name)
  120.                         found_recipients.append(r)
  121.                     break
  122.         return counts
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement