
[SNZ] Finding keywords with tf-idf

Jan 9th, 2020
import math
import re


def get_words(doc):
    """Split the document into words. The string is split into words on
    whitespace and punctuation characters.

    :param doc: document
    :type doc: str
    :return: list of the words that appear in the given document
    :rtype: list(str)
    """
    # split the document into words, convert them to lowercase,
    # and keep a word in the result only if its length is >2 and <20
    words = list()
    for word in re.split('\\W+', doc):
        if 2 < len(word) < 20:
            words.append(word.lower())
    return words


def get_vocabulary(documents):
    """Return a sorted list of all words that appear in the documents.

    :param documents: list of documents
    :type documents: list(str)
    :return: sorted list of words
    :rtype: list(str)
    """
    vocab = set()
    for doc_text in documents:
        words = get_words(doc_text)
        words_set = set(words)
        vocab.update(words_set)
    return sorted(vocab)


def cosine(v1, v2):
    """Return the cosine similarity between two vectors v1 and v2.

    :param v1: vector 1
    :type v1: list(float)
    :param v2: vector 2
    :type v2: list(float)
    :return: similarity between vector 1 and vector 2
    :rtype: float
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]
        y = v2[i]
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y
    return sumxy / math.sqrt(sumxx * sumyy)


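# Note on cosine() above: it computes dot(v1, v2) / (|v1| * |v2|). If either
# vector is all zeros (e.g. a query that shares no vocabulary words with the
# corpus), the denominator is zero and a ZeroDivisionError is raised.

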
def pearson(v1, v2):
    """Return the Pearson correlation coefficient between two vectors v1 and v2.

    :param v1: vector 1
    :type v1: list(float)
    :param v2: vector 2
    :type v2: list(float)
    :return: similarity between vector 1 and vector 2
    :rtype: float
    """
    sum1 = 0
    sum2 = 0
    sum1Sq = 0
    sum2Sq = 0
    pSum = 0
    n = len(v1)
    for i in range(n):
        x1 = v1[i]
        x2 = v2[i]
        sum1 += x1
        sum1Sq += x1 ** 2
        sum2 += x2
        sum2Sq += x2 ** 2
        pSum += x1 * x2
    num = pSum - (sum1 * sum2 / n)
    den = math.sqrt((sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n))
    if den == 0:
        return 0
    r = num / den
    return r


def calculate_document_frequencies(documents):
    """Return a dictionary with the document frequency of each word, i.e. the
    number of documents in which the word appears.

    :param documents: list of documents
    :type documents: list(str)
    :return: dictionary with document frequencies of the words
    :rtype: dict(str, int)
    """
    df = {}
    for doc_text in documents:
        words = get_words(doc_text)
        # count each word at most once per document
        words_set = set(words)
        for word in words_set:
            df.setdefault(word, 0)
            df[word] += 1
    return df


def calc_vector(cur_tf_idf, vocab):
    """Build the tf-idf vector of a document over the given vocabulary.

    :param cur_tf_idf: dictionary with tf-idf weights
    :type cur_tf_idf: dict(str, float)
    :param vocab: sorted list of all words that appear in at least one document
    :type vocab: list(str)
    :return: tf-idf vector for the given document
    :rtype: list(float)
    """
    vec = []
    for word in vocab:
        tf_idf = cur_tf_idf.get(word, 0)
        vec.append(tf_idf)
    return vec


def process_document(doc, df, N, vocab):
    """Compute the tf-idf vector of a given document.

    :param doc: document
    :type doc: str
    :param df: dictionary with document frequencies of the words
    :type df: dict(str, int)
    :param N: total number of documents
    :type N: int
    :param vocab: sorted list of all words that appear in at least one document
    :type vocab: list(str)
    :return: tf-idf vector for the given document
    :rtype: list(float)
    """
    if isinstance(doc, str):
        words = get_words(doc)
    else:
        words = doc
    idf = {}
    for word, cdf in df.items():
        idf[word] = math.log(N / cdf)
    f = {}  # how many times each word occurs in this document
    for word in words:
        f.setdefault(word, 0)
        f[word] += 1
    max_f = max(f.values())  # count of the most frequent word in this document
    tf_idf = {}
    for word, cnt in f.items():
        ctf = cnt * 1.0 / max_f
        tf_idf[word] = ctf * idf.get(word, 0)
    vec = calc_vector(tf_idf, vocab)
    return vec


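# Worked example for process_document() above, with made-up numbers: a word that
# occurs twice in a document whose most frequent word occurs four times, and that
# appears in 5 of N = 50 documents, gets the weight
#     tf * idf = (2 / 4) * ln(50 / 5) = 0.5 * 2.303 ≈ 1.15
# (term frequency normalised by the most frequent word, natural-log idf).

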
def rank_documents(doc, documents, sim_func=cosine):
    """Rank the documents by similarity to the given document.

    :param doc: document
    :type doc: str
    :param documents: list of documents
    :type documents: list(str)
    :param sim_func: similarity function
    :return: list of (similarity, document index) pairs, most similar first
    """
    df = calculate_document_frequencies(documents)
    N = len(documents)
    vocab = get_vocabulary(documents)
    doc_vectors = []
    for document in documents:
        vec = process_document(document, df, N, vocab)
        doc_vectors.append(vec)
    query_vec = process_document(doc, df, N, vocab)
    similarities = []
    for i, doc_vec in enumerate(doc_vectors):
        dist = sim_func(query_vec, doc_vec)
        similarities.append((dist, i))
    similarities.sort(reverse=True)
    return similarities


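# A minimal usage sketch for rank_documents() above (hypothetical query string,
# not part of the original paste):
#
#     texts = [text for text, label in data]
#     for sim, idx in rank_documents("I love that song", texts)[:3]:
#         print(round(sim, 3), texts[idx])
#
# This would print the three documents from `data` closest to the query under
# the cosine similarity.

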
def create_dataset(documents, labels):
    """Build a dataset with tf-idf weights and class labels, suitable for
    classification with decision trees.

    :param documents: list of documents
    :type documents: list(str)
    :param labels: list of class labels
    :type labels: list
    :return: dataset with tf-idf weights and classes, dictionary with document
            frequencies of the words, number of documents in the set, and the
            vocabulary of the given set of documents
    :rtype: list(list), dict(str, int), int, list(str)
    """
    dataset = []
    doc_vectors = []
    df = calculate_document_frequencies(documents)
    N = len(documents)
    vocab = get_vocabulary(documents)
    for document in documents:
        vec = process_document(document, df, N, vocab)
        doc_vectors.append(vec)
    for doc_vec, label in zip(doc_vectors, labels):
        doc_vec.append(label)
        dataset.append(doc_vec)
    return dataset, df, N, vocab


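# A hypothetical sketch of how the output of create_dataset() above could be
# consumed: every row is a tf-idf vector with its class label appended as the
# last element, so features and labels separate as
#
#     dataset, df, N, vocab = create_dataset([t for t, l in data],
#                                            [l for t, l in data])
#     X = [row[:-1] for row in dataset]
#     y = [row[-1] for row in dataset]

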
data = [
    ("""I like Rhythm and Blue music.""", 'formal'),
    ("""Back in my day Emo was a comedian :/""", 'informal'),
    ("""Why sit and listen to Locke, Jack, or Syead?""", 'informal'),
    ("""There's nothing he needs to change.""", 'formal'),
    ("""It does not exist.""", 'formal'),
    ("""I like when the Prime Minister goes door to door to find the girl!""", 'informal'),
    ("""Mine is book by Steve Martin called 'The Pleasure of my Company'.""", 'formal'),
    ("""What differentiates a mosquitoo from a blonde?""", 'formal'),
    ("""They're pretty good. Also, that's a good song.""", 'formal'),
    ("""And every time I hear that song I get butterflies in my stomach!""", 'informal'),
    ("""It's the biggest load of crap I've seen for ages.""", 'informal'),
    ("""I do not think Beyonce can sing, dance, or act. You mentioned Rihanna, who is that?""", 'formal'),
    ("""as i lay dying is far far away from christ definitaly!""", 'informal'),
    ("""I was unaware that you were in law enforcement, as well.""", 'formal'),
    ("""I might be seeing them in a few months!""", 'informal'),
    ("""I called to say 'I Love You""", 'formal'),
    ("""that´s why they needed to open that hatch so much!""", 'informal'),
    (
        """I would most likely not vote for him, although I believe Melania would be the most attractive First Lady in our country's history.""",
        'formal'),
    ("""I do not hate him.""", 'formal'),
    ("""He's supposed to be in jail!""", 'informal'),
    ("""i thought that she did an outstanding job in the movie""", 'informal'),
    ("""Nicole Kidman, I love her eyes""", 'informal'),
    ("""Youtube.com also features many of the current funny ads.""", 'formal'),
    ("""I enjoy watching my companion attempt to role-play with them.""", 'formal'),
    ("""omg i love that song im listening to it right now""", 'informal'),
    ("""Some of my favorite television series are Monk, The Dukes of Hazzard, Miami Vice, and The Simpsons.""",
     'formal'),
    ("""I have a desire to produce videos on Full Metal Alchemist.""", 'formal'),
    ("""tell him you want a 3 way with another hot girl""", 'informal'),
    (
        """I would travel to that location and physically assault you at this very moment, however, I am unable to swim.""",
        'formal'),
    ("""No, no, no that was WITNESS...""", 'informal'),
    ("""aneways shonenjump.com is cool and yeah narutos awsum""", 'informal'),
    (
        """Your mother is so unintelligent that she was hit by a cup and told the police that she was mugged.""",
        'formal'),
    ("""You must be creative and find something to challange us.""", 'formal'),
    ("""i think they would have, quite a shame isn't it""", 'informal'),
    ("""I am watching it right now.""", 'formal'),
    ("""I do not know; the person who invented the names had attention deficit disorder.""", 'formal'),
    ("""im a huge green day fan!!!!!""", 'informal'),
    ("""I believe, rather, that they are not very smart on this topic.""", 'formal'),
    ("""Of course it is Oprah, because she has been providing better advice for a longer time.""", 'formal'),
    ("""Chicken Little my son loves that movie I have to watch at least 4 times a day!""", 'informal'),
    ("""That is the key point, that you fell asleep.""", 'formal'),
    ("""A brunette female, a blonde, and person with red hair walked down a street.""", 'formal'),
    ("""who is your best bet for american idol season five""", 'informal'),
    ("""That is funny.  Girls need to be a part of everything.""", 'formal'),
    ("""In point of fact, Chris's performance looked like the encoure performed at a Genesis concert.""", 'formal'),
    ("""In my time, Emo was a comedian.""", 'formal'),
    ("""my age gas prices and my blood pressure  LOL""", 'informal'),
    ("""Moriarty and so forth, but what character did the Peruvian actor portray?""", 'formal'),
    ("""What did the beaver say to the log?""", 'formal'),
    ("""Where in the world do you come up with these questions????""", 'informal'),
    ("""even though i also agree that the girls on Love Hina are pretty scrumptious""", 'informal'),
    ("""I miss Aaliyah, she was a great singer.""", 'formal'),
    ("""and the blond says Great they already put me on my first murder mystery case""", 'informal'),
]


if __name__ == '__main__':
    threshold = float(input())
    sentences = list(map(int, input().split(',')))

    documents = []
    for x in data:
        documents.append(x[0])

    df = calculate_document_frequencies(documents)
    N = len(documents)
    vocab = get_vocabulary(documents)

    for i in sentences:
        sent = documents[i]
        tf_idf = process_document(sent, df, N, vocab)
        # keep only the words whose tf-idf weight is above the threshold
        pom = []
        for j in range(len(vocab)):
            val = tf_idf[j]
            if val > threshold:
                pom.append((vocab[j], val))
        if len(pom) == 0:
            print(sent, 'No keywords...')
            continue
        # print at most five keywords, sorted by descending tf-idf weight
        keywords = sorted(pom, key=lambda x: x[1], reverse=True)[:5]
        print_str = ''
        for keyword in keywords:
            print_str += keyword[0] + ': ' + str(keyword[1]) + ', '
        print(sent, '->', print_str[:-2])
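# Example run (hypothetical input): entering a threshold such as 0.4 on the first
# line and comma-separated indices such as 0,5,12 on the second selects documents
# 0, 5 and 12 from `data` and prints, for each of them, up to five words with
# tf-idf weight above 0.4, or 'No keywords...' when none exceed the threshold.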