Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# The code below is for a text file containing just one item. It still needs
# to be adapted to run on listfile.text (which contains raw data from
# multiple articles).
with open('listfilereduced.txt', 'r', encoding='utf8') as my_file:
    rawData = my_file.read()
print(rawData)

# Separate the body text from the other data: everything before the first
# body-text <div> goes to articleData, everything from that <div> onward goes
# to articleBody.
bodyMarker = "<div class=\"story-element story-element-text\">"
articleStart = rawData.find(bodyMarker)
if articleStart == -1:
    # str.find() reports "not found" as -1. Used directly as a slice bound,
    # -1 would put all but the last character into articleData and only the
    # last character into articleBody. Keep the whole input in articleData
    # and leave articleBody empty instead.
    articleStart = len(rawData)
articleData = rawData[:articleStart]
articleBody = rawData[articleStart:]
print(articleData)
print("*******")
print(articleBody)
print("*******")
# First, define a function to strip tags from the body text.
def stripTags(pageContents):
    """Return pageContents with every ``<...>`` span removed.

    The text is scanned one character at a time. A '<' opens a tag and the
    next '>' closes it; the delimiters and everything between them are
    dropped. A '>' that appears while no tag is open is ordinary text and is
    kept. If a '<' is never closed, the rest of the input is dropped.

    Args:
        pageContents: The string (e.g. an HTML fragment) to clean.

    Returns:
        A new string made of the characters that were outside any tag.
    """
    insideTag = False
    keptChars = []
    for char in pageContents:
        if char == '<':
            insideTag = True
        elif insideTag:
            # Still inside a tag: drop the character, and leave the tag
            # when its closing '>' is reached.
            if char == '>':
                insideTag = False
        else:
            keptChars.append(char)
    # Join once at the end instead of growing a string with += per character.
    return ''.join(keptChars)
# Strip the tags from the article body.
articleBodyText = stripTags(articleBody)
print(articleBodyText)

# Isolate the article title and the publication date.
# The title is everything before the first "</h1>"; the date lies between the
# publish-time <div> and the meta-data-icons <div>.
TitleEndLoc = articleData.find("</h1>")
dateStartLoc = articleData.find("<div class=\"storyPageMetaData-m__publish-time__19bdV\">")
dateEndLoc = articleData.find("<div class=\"meta-data-icons storyPageMetaDataIcons-m__icons__3E4Xg\">")

# str.find() returns -1 for a missing marker, and -1 is a valid slice bound
# meaning "one before the end", so each result is checked before slicing
# rather than silently cutting the text at the wrong position.
if TitleEndLoc == -1:
    titleString = ''
else:
    titleString = articleData[:TitleEndLoc]

if dateStartLoc == -1:
    dateString = ''
elif dateEndLoc == -1:
    # Start marker present but end marker missing: keep everything after
    # the start marker.
    dateString = articleData[dateStartLoc:]
else:
    dateString = articleData[dateStartLoc:dateEndLoc]

# Call stripTags to clean both strings.
articleTitle = stripTags(titleString)
articleDate = stripTags(dateString)
print(articleTitle)
print(articleDate)

# Clean the date a bit more: keep what lies between the first ":" (skipping
# the colon and the character after it) and the first ",".
startLocDate = articleDate.find(":")
endLocDate = articleDate.find(",")
# A missing ":" must not turn -1 + 2 into index 1 (which would drop the first
# character), and a missing "," must not drop the last character.
dateFrom = startLocDate + 2 if startLocDate != -1 else 0
dateTo = endLocDate if endLocDate != -1 else len(articleDate)
articleDateClean = articleDate[dateFrom:dateTo]
print(articleDateClean)

# Save the title, the date and the body text in a dictionary.
PAloTextDict = {"Title": articleTitle, "Date": articleDateClean, "Text": articleBodyText}
print(PAloTextDict)
# Normalize the text by:
# 1. Splitting the paragraphs of text into a list of words
articleBodyWordList = articleBodyText.split()
print(articleBodyWordList)

# 2. Removing punctuation and stopwords
from bnlp.corpus import stopwords, punctuations

# A. Remove punctuation first
listNoPunct = []
for word in articleBodyWordList:
    for mark in punctuations:
        word = word.replace(mark, '')
    # A token made only of punctuation is now the empty string; skip it so
    # that empty entries do not end up in the word lists.
    if word:
        listNoPunct.append(word)
print(listNoPunct)

# B. Remove stopwords
banglastopwords = stopwords()
print(banglastopwords)
# Build the lookup set once so each membership test does not rescan the whole
# stopword collection.
# NOTE(review): assumes stopwords() yields hashable strings - confirm against
# the installed bnlp version.
stopwordSet = set(banglastopwords)
cleanList = [word for word in listNoPunct if word not in stopwordSet]
print(cleanList)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement