#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 17:47:03 2019

@author: John
"""

## READ ME
## HTML scraper that loads:
## https://genome.jgi.doe.gov/cgi-bin/ncbiBlastp?db={genome_portal}&id={proteinID}
## and extracts the protein sequence from the submission field.
## genome_portal and proteinID are provided by the user, e.g. genome_portal = "Boled1" and proteinID = 165333.
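## For those example values, the assembled URL would be:
## https://genome.jgi.doe.gov/cgi-bin/ncbiBlastp?db=Boled1&id=165333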


# Import libraries
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


# Retrieve current working directory (`cwd`)
cwd = os.getcwd()

# Change directory
os.chdir("/Users/John/Documents/NPS James/PythonScraping/test")

# Load input spreadsheet containing genome portal and protein ID
df = pd.read_excel("NPS_input.xlsx", sheet_name="Fungi", header=0, converters={'proteinID': str, 'genome_portal': str})

# Drop NA
drop = np.where(df['proteinID'].isnull())[0]
df = df.drop(drop, axis=0)

# Reindex dataframe
df.index = pd.RangeIndex(len(df.index))

# Create new dataframe to store scraped data
df2 = pd.DataFrame(columns=["genome_portal", "proteinID", "proteinSeq", "URL"])

# Loop over df to scrape the protein sequence for each NPS protein ID
for i in range(len(df.index)):

    genome_portal = df.genome_portal[i]
    proteinID = df.proteinID[i]
    print(genome_portal + " " + proteinID)

    # Create URL
    URL = f"https://genome.jgi.doe.gov/cgi-bin/ncbiBlastp?db={genome_portal}&id={proteinID}"

    # Request URL
    page = requests.get(URL)
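    # Note: the response is not checked for errors here; page.raise_for_status()
    # could be added so that HTTP failures surface instead of an error page
    # being parsed below.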

    # Create a BeautifulSoup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # Extract protein sequence
    proteinSeq = soup.find('textarea').text
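    # The line above assumes the first <textarea> on the page is the BLAST
    # submission field holding the sequence; if it is missing (e.g. after a
    # failed request), soup.find('textarea') returns None and .text raises
    # an AttributeError.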

    # Store the data in df2
    df2.at[i, "genome_portal"] = genome_portal
    df2.at[i, "proteinID"] = proteinID
    df2.at[i, "proteinSeq"] = proteinSeq
    df2.at[i, "URL"] = URL


# Write scraped sequences to a new FASTA file; the with-block closes the
# file automatically
with open("NPS_proteinSeq.txt", "w") as ofile:

    for i in range(len(df2.index)):

        ofile.write(">" + df2.genome_portal[i] + "\n" + df2.proteinSeq[i] + "\n")


# Write to Excel
df2.to_excel("NPS_proteinSeq.xlsx")
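

# A minimal sketch for spot-checking a single portal/ID pair before running
# the full loop, using the example values from the READ ME ("Boled1", 165333)
# and the same <textarea> assumption as above; uncomment to try it:
#
# test_url = "https://genome.jgi.doe.gov/cgi-bin/ncbiBlastp?db=Boled1&id=165333"
# test_page = requests.get(test_url)
# test_soup = BeautifulSoup(test_page.text, 'html.parser')
# print(test_soup.find('textarea').text)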