myloyo

coursework

May 17th, 2024
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# The tokenizer models must be available before sent_tokenize/word_tokenize
# are called, so the downloads come first.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# nltk.download()

prim = "She’s swept the floor. She’s very kind to me."
tok = sent_tokenize(prim)
print(tok)
for elem in tok:
    temp = word_tokenize(elem)
    print(temp)
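# With the punkt model installed, the prints above should show roughly the
# following (exact token splits can vary between NLTK versions):
#   ['She’s swept the floor.', 'She’s very kind to me.']
#   ['She', '’', 's', 'swept', 'the', 'floor', '.']
#   ['She', '’', 's', 'very', 'kind', 'to', 'me', '.']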

# DICTIONARIES

DictWith1Meaning = {
    "arencha": "are not you",
    "cuz": "because",
    "cuppa": "cup of",
    "dunno": "do not know",
    "finna": "fixing to",
    "gimme": "give me",
    "gonna": "going to",
    "gotta": "got to",
    "helluva": "hell of a",
    "howdy": "how do you do",
    "hafta": "have to",
    "Imma": "I am going to",
    "innit": "is not it",
    "Ion": "I do not",
    "kinda": "kind of",
    "lemme": "let me",
    "methinks": "I think",
    "tryna": "trying to",
    "wanna": "want to",
    "whatcha": "what are you",
    "wonnot": "will not",
    "yessir": "yes sir",
    "a’ight": "alright",
    "amn’t": "am not",
    "’n’": "and",
    "‘n’": "and",
    "aren’t": "are not",
    "’bout": "about",
    "cap’n": "captain",
    "can’t": "cannot",
    "’cause": "because",
    "’cept": "except",
    "c’mon": "come on",
    "could’ve": "could have",
    "couldn’t": "could not",
    "couldn’t’ve": "could not have",
    "daresn’t": "dare not",
    "dasn’t": "dare not",
    "didn’t": "did not",
    "doesn't": "does not",
    "e’en": "even",
    "e’er": "ever",
    "’em": "them",
    "fo’c’sle": "forecastle",
    "’gainst": "against",
    "g’day": "good day",
    "giv’n": "given",
    "gi’z": "give us",
    "gon’t": "go not",
    "hadn’t": "had not",
    "had’ve": "had have",
    "hasn’t": "has not",
    "haven’t": "have not",
    "here’s": "here is",
    "how’re": "how are",
    "if’n": "if and when",
    "I'd've": "I would have",
    "I’m": "I am",
    "I’m'onna": "I am going to",
    "I’m’o": "I am going to",
    "I’m'na": "I am going to",
    "I’ve": "I have",
    "isn’t": "is not",
    "it’d": "it would",
    "let’s": "let us",
    "loven’t": "love not",
    "ma’am": "madam",
    "mayn’t": "may not",
    "may’ve": "may have",
    "mightn’t": "might not",
    "might’ve": "might have",
    "mine’s": "mine is",
    "mustn’t": "must not",
    "mustn’t’ve": "must not have",
    "must’ve": "must have",
    "’neath": "beneath",
    "needn’t": "need not",
    "nal": "and all",
    "ne’er": "never",
    "o’": "of",
    "o’clock": "of the clock",
    "o’er": "over",
    "ol’": "old",
    "ought’ve": "ought have",
    "oughtn’t": "ought not",
    "oughtn’t’ve": "ought not have",
    "’round": "around",
    "shalln’t": "shall not",
    "shan’": "shall not",
    "shan’t": "shall not",
    "should’ve": "should have",
    "shouldn’t": "should not",
    "shouldn’t’ve": "should not have",
    "so’re": "so are",
    "so’ve": "so have",
    "that’re": "that are",
    "there’re": "there are",
    "these’re": "these are",
    "these’ve": "these have",
    "they’ve": "they have",
    "those’re": "those are",
    "those’ve": "those have",
    "’thout": "without",
    "’til": "until",
    "’tis": "it is",
    "’tisn’t": "it is not",
    "to’ve": "to have",
    "’twas": "it was",
    "’tween": "between",
    "’twere": "it were",
    "w’all": "we all",
    "w’at": "we at",
    "wasn’t": "was not",
    "we’d’ve": "we would have",
    "we’re": "we are",
    "we’ve": "we have",
    "weren’t": "were not",
    "what’d": "what did",
    "what’ve": "what have",
    "when’d": "when did",
    "where’d": "where did",
    "where’re": "where are",
    "where’ve": "where have",
    "which’re": "which are",
    "which’ve": "which have",
    "who’re": "who are",
    "who’ve": "who have",
    "why’d": "why did",
    "why’re": "why are",
    "willn’t": "will not",
    "won’t": "will not",
    "would’ve": "would have",
    "wouldn’t": "would not",
    "wouldn’t’ve": "would not have",
    "y’at": "you at",
    "y'ever": "have you ever",
    "y’know": "you know",
    "you’re": "you are",
    "you’ve": "you have",
    "y’all": "you all",
    "y’all’d’ve": "you all would have",
    "y’all’dn't’ve": "you all would not have",
    "y’all’re": "you all are",
    "y’all’ren’t": "you all are not",
    "yes’m": "yes madam",
    "who’d’ve": "who would have",
    "yesn’t": "yes not"
}

DictOfFormal = {
    "’re": "are",
    "’ll": "will",
    "’ight": "alright",
    "’t": "not",
    "’ve": "have"
}
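
# Quick sanity check of the lookup scheme (illustrative only): the second pass
# below concatenates the apostrophe token with the token that follows it, so
# that concatenation must be a key of DictOfFormal.
assert DictOfFormal["’" + "re"] == "are"
assert "gonna" in DictWith1Meaning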

# Read the text to be processed from the input file
filename = "in_text_contractions.txt"
arr = []                                  # TODO: add handling that preserves paragraph breaks!!!
with open(filename, "r", encoding='utf-8') as file:
    content = file.read()
    # arr.append(content)


#prim = "You’re very tall."
#prim1 = "She’s swept the floor."
#prim2 = "She’s very kind to me."
#tok = word_tokenize(prim1 +" "+ prim2)
#print(tok)

# tagged = nltk.pos_tag(tok)
#print(tagged)
#print()

# First pass: informal contractions, matched as whole space-separated tokens
text = content.split(" ")
for i in range(0, len(text)):
    temp = text[i]
    f = False   # flag: was the first letter capitalized?
    # keys such as "Imma" are stored capitalized, so only lowercase when
    # the token is not already a dictionary key
    if temp and temp[0].isupper() and temp not in DictWith1Meaning:
        f = True
        temp = temp[0].lower() + temp[1:]   # strings are immutable, so reassign
    if temp in DictWith1Meaning:
        temp = DictWith1Meaning[temp]
    if f:
        temp = temp[0].upper() + temp[1:]   # restore the leading capital
        f = False
    text[i] = temp

z = ' '.join(text)
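
# Illustrative example of the first pass (not taken from the input file):
# "Gonna" is lowercased, expanded via DictWith1Meaning, then re-capitalized,
# so "Gonna see whatcha got" becomes "Going to see what are you got".
# Note that tokens with attached punctuation, e.g. "gonna,", will not match.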

# Second pass: formal contractions (word_tokenize splits the apostrophe off)
sents = sent_tokenize(z)
for i in range(0, len(sents)):
    s = word_tokenize(sents[i])
    j = 0
    while j < len(s) - 1:               # a while loop, because the list shrinks on deletion
        if s[j] == "’" and s[j] + s[j + 1] in DictOfFormal:
            s[j] = DictOfFormal[s[j] + s[j + 1]]
            del s[j + 1]                # delete by index, not by value
        j += 1
    # rebuild a sentence string from the processed tokens
    temp = ""
    c = ".’?!:;)]}«„'"
    for x in range(0, len(s)):
        if s[x] in c:
            temp = temp[:-1]            # drop the space before closing punctuation
            temp += s[x]
        elif s[x] == ',' or s[x] == '”' or s[x] == '»':
            temp = temp[:-1]
            temp += s[x] + " "
        else:
            temp += s[x] + " "
    sents[i] = temp
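# Illustrative example of the rebuild step above: the tokens
# ['you', 'are', 'tall', '.'] are joined back as "you are tall.", because the
# space added before a closing punctuation mark is stripped again.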

# Join the sentences back into the full text
FullText = ' '.join(sents)

print(FullText)
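
# A possible approach to the TODO above (a sketch, assuming paragraphs in the
# input file are separated by blank lines; 'expand' stands for the two passes
# above wrapped in a hypothetical helper function):
# paragraphs = content.split("\n\n")
# processed = [expand(p) for p in paragraphs]
# FullText = "\n\n".join(processed)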