Advertisement
sbmonzur

CleaningText

Mar 9th, 2021
148
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.61 KB | None | 0 0
  1. #The code below is for a textfile containing just one item. I am not sure how to tweak this to make it run for listfile.text (which contains raw data from multiple articles)
  2.  
  3. with open('listfilereduced.txt', 'r', encoding='utf8') as my_file:
  4.     rawData = my_file.read()
  5.     print(rawData)
  6.  
  7. #Separating body text from other data
  8.  
  9. articleStart = rawData.find("<div class=\"story-element story-element-text\">")
  10. articleData = rawData[:articleStart]
  11. articleBody = rawData[articleStart:]
  12. print(articleData)
  13. print("*******")
  14. print(articleBody)
  15. print("*******")
  16.  
  17. #First, I define a function to strip tags from the body text
  18.  
  19. def stripTags(pageContents):
  20.     insideTag = 0
  21.     text = ''
  22.  
  23.     for char in pageContents:
  24.         if char == '<':
  25.             insideTag = 1
  26.         elif (insideTag == 1 and char == '>'):
  27.             insideTag = 0
  28.         elif insideTag == 1:
  29.             continue
  30.         else:
  31.             text += char
  32.     return text
  33.  
  34. #Calling the function
  35. articleBodyText = stripTags(articleBody)
  36. print(articleBodyText)
  37.  
  38. ##Isolating article title and publication date
  39.  
  40. TitleEndLoc = articleData.find("</h1>")
  41. dateStartLoc = articleData.find("<div class=\"storyPageMetaData-m__publish-time__19bdV\">")
  42. dateEndLoc=articleData.find("<div class=\"meta-data-icons storyPageMetaDataIcons-m__icons__3E4Xg\">")
  43. titleString = articleData[:TitleEndLoc]
  44. dateString = articleData[dateStartLoc:dateEndLoc]
  45.  
  46.  
  47. ##Call stripTags to clean
  48. articleTitle= stripTags(titleString)
  49. articleDate = stripTags(dateString)
  50.  
  51. print(articleTitle)
  52. print(articleDate)
  53.  
  54. #Cleaning the date a bit more
  55. startLocDate = articleDate.find(":")
  56. endLocDate = articleDate.find(",")
  57. articleDateClean = articleDate[startLocDate+2:endLocDate]
  58. print(articleDateClean)
  59.  
  60. #save all this data to a dictionary that saves the title, data and the body text
  61. PAloTextDict = {"Title": articleTitle, "Date": articleDateClean, "Text": articleBodyText}
  62. print(PAloTextDict)
  63.  
  64. #Normalize text by:
  65.  
  66. #1. Splitting paragraphs of text into lists of words
  67.  
  68. articleBodyWordList = articleBodyText.split()
  69. print(articleBodyWordList)
  70.  
  71. #2.Removing punctuation and stopwords
  72.  
  73. from bnlp.corpus import stopwords, punctuations
  74.  
  75. #A. Remove punctuation first
  76.  
  77. listNoPunct = []
  78.  
  79. for word in articleBodyWordList:
  80.     for mark in punctuations:
  81.         word=word.replace(mark, '')
  82.     listNoPunct.append(word)
  83. print(listNoPunct)
  84.  
  85.  
  86. #B. removing stopwords
  87. banglastopwords = stopwords()
  88. print(banglastopwords)
  89.  
  90. cleanList=[]
  91. for word in listNoPunct:
  92.     if word in banglastopwords:
  93.         continue
  94.     else:
  95.         cleanList.append(word)
  96. print(cleanList)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement