SHARE
TWEET

Untitled

a guest Jun 20th, 2019 59 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. """
  2. Exclude CDHIT Clusters that contains number of sequences below a certain threshold
  3. """
  4.  
  5.  
  6. import sys
  7.  
  8. args = sys.argv
  9.  
  10. if len(args) < 3:
  11.     exit("run: python cdhit_filter.py <input_file> <threshold>")
  12.  
  13. else:
  14.     ORIGINAL_FILE = args[1]
  15.     NEW_FILE = "filtered_" + ORIGINAL_FILE
  16.     THRESHOLD = int(args[2])
  17.  
  18. cluster_buffer = ""
  19.  
  20. with open(ORIGINAL_FILE, 'r') as cdhit, open(NEW_FILE, 'w') as filtered:
  21.     first_line = next(cdhit) # Add the first line to the buffer
  22.     cluster_buffer += first_line
  23.  
  24.     # Iterate over each line
  25.     for line in cdhit:
  26.         # Check if line starts with ">" that means a new cluster started, so analyse the previous one
  27.         if line[0] == ">":
  28.             # Count how many "at" in the cluster
  29.             count = cluster_buffer.count("at")  + 1
  30.  
  31.             # Check if the count more than the predefined threshold
  32.             # If yes, write it to the new file
  33.             if count >= THRESHOLD:
  34.                 filtered.write(cluster_buffer)
  35.  
  36.             # Empty the cluster_buffer for buffering the new cluster
  37.             cluster_buffer = ""
  38.             cluster_buffer = line
  39.  
  40.         else:
  41.             cluster_buffer += line
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top