Untitled

def validate_by_counts(tweet, frequent_keys, keywords, recipients, partials, rules):
    """
    validating function.
    Look for counts of recipients, keywords, frequent keys and mark it as eligible if it passes one of
    provided rules (eg TEST_VALIDATION_RULES). If all rules not passed, tweet marked as ineligible.
    Currently, this is mostly used for our long-tail tweets job in which we find a bunch of text
    from high-confidence tweets, web comments, and look at the most frequent words and phrases.
    We then pull another set of tweets from twitter which might be on topic, and use our
    various added frequent keywords from the high-confidence tweets/web comments (eligible ones
    from our verified/unveried web search and comment scraping/fetching) to validate this new set of tweets.
    """
    def _get_counts(txt, keywords, recipients, partials, frequent_keys):
        """
        Function count recipients, keywords and frequent keys on text. Return dictionary with counts
        """
        # init counts dictionary
        counts = {'keywords': [], 'keyphrases': [], 'recipients': [],
                  'high_freq_words': [], 'medium_freq_words': [], 'low_freq_words': [],
                  'high_freq_phrases': [], 'medium_freq_phrases': [], 'low_freq_phrases': [],}
        # remove punctuation from text
        txt = remove_punctuation(txt).lower()
        if not txt:
            return counts
        # init keys dictionary for search. Here we use "counts" fields and will it with appropriate keys list
        # later we will loop on that dictionary and use dictionary key as key for "counts" dictionary
        # and value(appropriate keys list) for lookup on text
        keys_lists = {'keywords': [kw for kw in keywords if ' ' not in kw and '-' not in kw],
                      'keyphrases': [kw for kw in keywords if ' ' in kw or '-' in kw],
                      'high_freq_words': frequent_keys.get('top_terms_words_high', []),
                      'medium_freq_words': frequent_keys.get('top_terms_words_medium', []),
                      'low_freq_words': frequent_keys.get('top_terms_words_low', []),
                      'high_freq_phrases': frequent_keys.get('top_terms_phrases_high', []),
                      'medium_freq_phrases': frequent_keys.get('top_terms_phrases_medium', []),
                      'low_freq_phrases': frequent_keys.get('top_terms_phrases_low', []),
                      }
        for destination, keys in keys_lists.iteritems():
            # if keywords provided as dictionaries need to retrieve words
            if keys and isinstance(keys[0], dict):
                keys = [k['word'] for k in keys]
            # sort keys by number of words, take copy to not change original list order
            keys = sorted(keys[:], key=lambda x: len(x.split()), reverse=True)
            # remove punctuation from keys as well
            keys = [remove_punctuation(k).lower() for k in keys]

            for key in keys:
                if key in txt:
                    # if we already have that phrase in previously found phrases - skip it
                    for found in counts[destination]:
                        if key in found:
                            break
                    else:
                        # append only if phrase not contained in previously found phrases
                        counts[destination].append(key)

        _log.debug("Before processing, counts = {0}".format(counts))
        # Get all words from counts dictionary
        all_words = [item for sublist in counts.values()
                     for item in sublist]
        _log.debug("all_words = {0}".format(all_words))
        # Remove repeating words (substrings)
        sorted_list = sorted(all_words, key=len, reverse=True)
        sifted_words = []
        for s in sorted_list:
            if not any([s in o for o in sifted_words]):
                sifted_words.append(s)
        _log.debug("sifted_words = {0}".format(sifted_words))
        # Create new dictionary taking into account only sifted words
        counts = {k: [i for i in v if i in sifted_words]
                  for k, v in counts.iteritems()}
        _log.debug("After processing, counts = {0}".format(counts))

        # count recipients
        found_recipients = []
        for r in recipients:
            # collect all terms for lookup firstly
            lookup_names = set()
            partials_for_check = [part['name'].lower() for part in partials
                                  if part['name'].lower() in r['name'].lower() and part['name'] not in stopwords]
            lookup_names.update(partials_for_check)
            # checking same values in found recipients
            have_same_recipient = False
            for found_r in found_recipients:
                if 'fb' in found_r and 'fb' in r and found_r['fb'].lower() == r['fb'].lower():
                    have_same_recipient = True
                    break
                if 'tw' in found_r and 'tw' in r and found_r['tw'].lower() == r['tw'].lower():
                    have_same_recipient = True
                    break
                found_redirects = set(found_r.get('redirect_names', []))
                r_redirects = set(r.get('redirect_names', []))
                if found_redirects & r_redirects:
                    have_same_recipient = True
                    break
                found_partials = [p['name'].lower() for p in partials
                                  if p['name'].lower() in found_r['name'].lower() and p['name'] not in stopwords]
                if set(found_partials) & set(partials_for_check):
                    have_same_recipient = True
                    break
            if have_same_recipient:
                continue
            if r['name'] not in stopwords:
                lookup_names.add(r['name'].lower())

            if 'fb' in r:
                lookup_names.add(r['fb'].lower())
            if 'tw' in r:
                lookup_names.add(r['tw'].lower())

            redirect_names = r.get('redirect_names', [])
            lookup_names.update([rn.lower() for rn in redirect_names if rn not in stopwords])

            for name in lookup_names:
                if name in txt:
                    for found in counts['recipients']:
                        if name in found or found in name:
                            break
                    else:
                        # if we don't found any matches in previously found names we append new entry
                        counts['recipients'].append(name)
                        found_recipients.append(r)
                    break
        return counts