def validate_by_counts(tweet, frequent_keys, keywords, recipients, partials, rules):
    """
    Validating function.
    Looks at the counts of recipients, keywords, and frequent keys and marks the tweet
    as eligible if it passes one of the provided rules (e.g. TEST_VALIDATION_RULES).
    If no rule passes, the tweet is marked as ineligible.
    Currently, this is mostly used for our long-tail tweets job, in which we take a
    bunch of text from high-confidence tweets and web comments and look at the most
    frequent words and phrases. We then pull another set of tweets from Twitter which
    might be on topic, and use the frequent keywords added from the high-confidence
    tweets/web comments (eligible ones from our verified/unverified web search and
    comment scraping/fetching) to validate this new set of tweets.
    """
    def _get_counts(txt, keywords, recipients, partials, frequent_keys):
        """
        Count recipients, keywords, and frequent keys in the text.
        Return a dictionary with the counts.
        """
        # init counts dictionary
        counts = {'keywords': [], 'keyphrases': [], 'recipients': [],
                  'high_freq_words': [], 'medium_freq_words': [], 'low_freq_words': [],
                  'high_freq_phrases': [], 'medium_freq_phrases': [], 'low_freq_phrases': []}
        # remove punctuation from the text
        txt = remove_punctuation(txt).lower()
        if not txt:
            return counts
        # init the keys dictionary for the search. Its keys mirror the "counts" fields and
        # each value is the appropriate list of keys; later we loop over this dictionary,
        # using the dictionary key as the key into "counts" and the value (the keys list)
        # for the lookup in the text
        keys_lists = {'keywords': [kw for kw in keywords if ' ' not in kw and '-' not in kw],
                      'keyphrases': [kw for kw in keywords if ' ' in kw or '-' in kw],
                      'high_freq_words': frequent_keys.get('top_terms_words_high', []),
                      'medium_freq_words': frequent_keys.get('top_terms_words_medium', []),
                      'low_freq_words': frequent_keys.get('top_terms_words_low', []),
                      'high_freq_phrases': frequent_keys.get('top_terms_phrases_high', []),
                      'medium_freq_phrases': frequent_keys.get('top_terms_phrases_medium', []),
                      'low_freq_phrases': frequent_keys.get('top_terms_phrases_low', []),
                      }
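        # For example (hypothetical keywords): 'relief-fund' lands in 'keyphrases' because
        # it contains a hyphen, 'storm surge' because it contains a space, while a single
        # word like 'donate' lands in 'keywords'.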
        for destination, keys in keys_lists.iteritems():
            # if keywords are provided as dictionaries, we need to retrieve the words
            if keys and isinstance(keys[0], dict):
                keys = [k['word'] for k in keys]
            # sort keys by number of words; take a copy so the original list order is kept
            keys = sorted(keys[:], key=lambda x: len(x.split()), reverse=True)
            # remove punctuation from the keys as well
            keys = [remove_punctuation(k).lower() for k in keys]
            for key in keys:
                if key in txt:
                    # if we already have that phrase among previously found phrases - skip it
                    for found in counts[destination]:
                        if key in found:
                            break
                    else:
                        # append only if the phrase is not contained in a previously found phrase
                        counts[destination].append(key)
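        # At this point each bucket holds only the longest matching keys: because keys are
        # checked longest-first, if both 'new york city' and 'new york' (hypothetical
        # examples) matched the text, only 'new york city' was appended for that bucket.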
- _log.debug("Before processing, counts = {0}".format(counts))
- # Get all words from counts dictionary
- all_words = [item for sublist in counts.values()
- for item in sublist]
- _log.debug("all_words = {0}".format(all_words))
- # Remove repeating words (substrings)
- sorted_list = sorted(all_words, key=len, reverse=True)
- sifted_words = []
- for s in sorted_list:
- if not any([s in o for o in sifted_words]):
- sifted_words.append(s)
- _log.debug("sifted_words = {0}".format(sifted_words))
- # Create new dictionary taking into account only sifted words
- counts = {k: [i for i in v if i in sifted_words]
- for k, v in counts.iteritems()}
- _log.debug("After processing, counts = {0}".format(counts))
        # count recipients
        found_recipients = []
        for r in recipients:
            # collect all terms for the lookup first
            lookup_names = set()
            partials_for_check = [part['name'].lower() for part in partials
                                  if part['name'].lower() in r['name'].lower() and part['name'] not in stopwords]
            lookup_names.update(partials_for_check)
            # check for the same values among already found recipients
            have_same_recipient = False
            for found_r in found_recipients:
                if 'fb' in found_r and 'fb' in r and found_r['fb'].lower() == r['fb'].lower():
                    have_same_recipient = True
                    break
                if 'tw' in found_r and 'tw' in r and found_r['tw'].lower() == r['tw'].lower():
                    have_same_recipient = True
                    break
                found_redirects = set(found_r.get('redirect_names', []))
                r_redirects = set(r.get('redirect_names', []))
                if found_redirects & r_redirects:
                    have_same_recipient = True
                    break
                found_partials = [p['name'].lower() for p in partials
                                  if p['name'].lower() in found_r['name'].lower() and p['name'] not in stopwords]
                if set(found_partials) & set(partials_for_check):
                    have_same_recipient = True
                    break
            if have_same_recipient:
                continue
            if r['name'] not in stopwords:
                lookup_names.add(r['name'].lower())
            if 'fb' in r:
                lookup_names.add(r['fb'].lower())
            if 'tw' in r:
                lookup_names.add(r['tw'].lower())
            redirect_names = r.get('redirect_names', [])
            lookup_names.update([rn.lower() for rn in redirect_names if rn not in stopwords])
            for name in lookup_names:
                if name in txt:
                    for found in counts['recipients']:
                        if name in found or found in name:
                            break
                    else:
                        # if we don't find any matches among previously found names, append a new entry
                        counts['recipients'].append(name)
                        found_recipients.append(r)
                    break
        return counts
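
# A minimal sketch of the input shapes that _get_counts appears to expect, inferred
# from how the fields are read above. The concrete values are hypothetical examples;
# the structure of `rules` is not sketched, since the code that consumes it is not
# shown above.
example_frequent_keys = {
    # each bucket may hold plain strings or dicts carrying a 'word' field
    'top_terms_words_high': ['flood', 'storm'],
    'top_terms_words_medium': [],
    'top_terms_words_low': [],
    'top_terms_phrases_high': [{'word': 'storm surge'}],
    'top_terms_phrases_medium': [],
    'top_terms_phrases_low': [],
}
# keywords containing a space or hyphen are treated as keyphrases, the rest as single keywords
example_keywords = ['donate', 'relief-fund']
# recipients are matched by name, by their 'fb'/'tw' handles, and by any redirect_names
example_recipients = [
    {'name': 'Red Cross', 'fb': 'redcross', 'tw': 'redcross',
     'redirect_names': ['American Red Cross']},
]
# partials are short name fragments checked against recipient names (stopwords excluded)
example_partials = [{'name': 'Red'}]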