Try95th

CSV Links Added to Python [edited copy]

Dec 11th, 2022 (edited)
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import os
  2. import pandas as pd
  3. import re
  4. from bs4 import BeautifulSoup
  5. from bs4 import element as bs4_element
  6. from google.colab import drive
  7. import csv
  8. drive.mount("/content/gdrive")
  9.  
  10. os.chdir("/content/gdrive/My Drive/Colab Notebooks/Fix WordPress Links")
  11.  
  12.  
# Sample input document: four WordPress paragraph blocks, each wrapped in
# "wp:paragraph" HTML comments. The first paragraph already contains a
# hyperlink; the others contain plain text only.
html_doc = """
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another multi word Active Campaign Text</p>
<!-- /wp:paragraph -->
"""

# Parse the sample markup into a tree that the rest of the script edits in
# place and finally prints.
soup = BeautifulSoup(html_doc, 'html.parser')
  32.  
  33. # read the CSV file with anchor text and hyperlinks
  34. with open('file.csv', 'r') as csv_file:
  35.   reader = csv.reader(csv_file)
  36.   hyperlinks = dict(reader)
  37. # hyperlinks = {k:v for k, v in pd.read_html(
  38. #         'https://stackoverflow.com/q/74699446/6146136'
  39. # )[0].to_dict('split')['data']}
  40.  
  41. # from bs4 import element as bs4_element
  42. be_navStr = bs4_element.NavigableString
  43.  
  44. hList = [
  45.     (anchor_text.strip(), hyperlink.strip()) for
  46.     anchor_text, hyperlink in hyperlinks.items()
  47.     if anchor_text.strip() and hyperlink.strip() # no blanks
  48. ]
  49.  
  50.  
  51. for txt, link in hList:
  52.     navStrs = [
  53.         d for d in soup.descendants if type(d) == be_navStr
  54.         # and f' {txt.lower()} ' in f' {d.text.strip().lower()} ' # same as
  55.         and f' {txt.lower()} ' in f' {d.string.strip().lower()} ' # same as
  56.         # and (' '+txt.lower()+' ') in (' '+d.text.strip().lower()+' ')
  57.     ]
  58.     for ns in navStrs:
  59.         # tLen, remStr = len(txt), f' {ns.get_text().strip()} '
  60.         tLen, remStr = len(txt), f' {ns.string.strip()} '
  61.         # tLen, remStr = len(txt), f' {ns.text.strip()} '
  62.         if remStr[1:-1].lower() == txt.lower():
  63.             # to skip if it's already a hyperlink
  64.             if ns.parent.name == 'a':
  65.                 # ns.parent['href'] = link # if you want to replace/update link
  66.                 continue
  67.  
  68.         while f' {txt.lower()} ' in remStr.lower():
  69.             sInd = remStr.lower().find(f' {txt.lower()} ') + 1
  70.  
  71.             hlTag = soup.new_tag('a', href=link)
  72.             hlTag.append(remStr[sInd:sInd + tLen])
  73.  
  74.             newCont = [remStr[:sInd].lstrip(), hlTag, ' ']
  75.             for addn in newCont: ns.insert_before(addn)
  76.  
  77.             ### PREVIOUSLY MISSING LINES BELOW [sorryyy -_-] ###
  78.             remStr = f' {remStr[sInd + tLen:].strip()} '
  79.         ns.replace_with(remStr.strip())
  80.  
  81. print(soup)
Advertisement
Add Comment
Please, Sign In to add comment