Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Turn plain-text keywords in WordPress block HTML into hyperlinks.

Reads (anchor text, hyperlink) pairs from ``file.csv`` and wraps every
whitespace-delimited, case-insensitive occurrence of each anchor text found
in the parsed HTML in an ``<a href=...>`` tag, then prints the result.
"""
import csv
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from bs4 import element as bs4_element
from google.colab import drive

# Mount Google Drive and switch to the notebook folder; 'file.csv' below is
# opened relative to this directory.
drive.mount("/content/gdrive")
os.chdir("/content/gdrive/My Drive/Colab Notebooks/Fix WordPress Links")

# Sample WordPress block markup used as the input document.
html_doc = """
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign Text</p>
<!-- /wp:paragraph -->
"""
soup = BeautifulSoup(html_doc, 'html.parser')

# Read the CSV file with anchor text (column 1) and hyperlinks (column 2).
# newline='' is what the csv module expects for files handed to csv.reader.
# Rows with fewer than two columns (e.g. blank lines) are skipped and extra
# columns are ignored; a plain dict(reader) would raise ValueError on either.
with open('file.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    hyperlinks = {row[0]: row[1] for row in reader if len(row) >= 2}

# Alternative source for quick experiments (this is what pandas is kept for):
# hyperlinks = {k: v for k, v in pd.read_html(
#     'https://stackoverflow.com/q/74699446/6146136'
# )[0].to_dict('split')['data']}

be_navStr = bs4_element.NavigableString

# (anchor text, link) pairs with surrounding whitespace removed; pairs where
# either side is blank are dropped.
hList = [
    (anchor_text.strip(), hyperlink.strip())
    for anchor_text, hyperlink in hyperlinks.items()
    if anchor_text.strip() and hyperlink.strip()  # no blanks
]

# NOTE: matching requires whitespace (or the node edge) on both sides of the
# anchor text, so "Google." is not matched. Each matched text node is
# stripped before processing and written back stripped, so whitespace at the
# very edges of such a node is not preserved.
for txt, link in hList:
    tLen = len(txt)
    needle = f' {txt.lower()} '  # loop-invariant, padded for whole-word match

    # Collect matching text nodes up front because the tree is mutated below.
    # The exact type check is deliberate: bs4's Comment is a NavigableString
    # subclass, so isinstance() would also pick up the <!-- wp:... --> markers.
    navStrs = [
        d for d in soup.descendants
        if type(d) is be_navStr
        and needle in f' {d.string.strip().lower()} '
    ]

    for ns in navStrs:
        # Skip text that already sits inside a hyperlink, whether it was in
        # the input or inserted for an earlier CSV row. Checking every
        # ancestor (not just an exact-text match on the direct parent) keeps
        # an <a> from being nested inside another <a>.
        if ns.find_parent('a') is not None:
            # ns.find_parent('a')['href'] = link  # to replace/update the link
            continue

        remStr = f' {ns.string.strip()} '
        while needle in remStr.lower():
            # +1 skips the padding space so sInd points at the keyword itself.
            sInd = remStr.lower().find(needle) + 1
            hlTag = soup.new_tag('a', href=link)
            # Slice from remStr (not txt) to keep the document's own casing.
            hlTag.append(remStr[sInd:sInd + tLen])
            newCont = [remStr[:sInd].lstrip(), hlTag, ' ']
            for addn in newCont:
                ns.insert_before(addn)
            # Continue scanning in whatever follows the keyword.
            remStr = f' {remStr[sInd + tLen:].strip()} '

        # Whatever text is left after the last match replaces the original node.
        ns.replace_with(remStr.strip())

print(soup)
Advertisement
Add Comment
Please, Sign In to add comment