Advertisement
Guest User

Untitled

a guest
Jun 20th, 2019
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.14 KB | None | 0 0
  1. """
  2. Exclude CDHIT Clusters that contains number of sequences below a certain threshold
  3. """
  4.  
  5.  
  6. import sys
  7.  
  8. args = sys.argv
  9.  
  10. if len(args) < 3:
  11. exit("run: python cdhit_filter.py <input_file> <threshold>")
  12.  
  13. else:
  14. ORIGINAL_FILE = args[1]
  15. NEW_FILE = "filtered_" + ORIGINAL_FILE
  16. THRESHOLD = int(args[2])
  17.  
  18. cluster_buffer = ""
  19.  
  20. with open(ORIGINAL_FILE, 'r') as cdhit, open(NEW_FILE, 'w') as filtered:
  21. first_line = next(cdhit) # Add the first line to the buffer
  22. cluster_buffer += first_line
  23.  
  24. # Iterate over each line
  25. for line in cdhit:
  26. # Check if line starts with ">" that means a new cluster started, so analyse the previous one
  27. if line[0] == ">":
  28. # Count how many "at" in the cluster
  29. count = cluster_buffer.count("at") + 1
  30.  
  31. # Check if the count more than the predefined threshold
  32. # If yes, write it to the new file
  33. if count >= THRESHOLD:
  34. filtered.write(cluster_buffer)
  35.  
  36. # Empty the cluster_buffer for buffering the new cluster
  37. cluster_buffer = ""
  38. cluster_buffer = line
  39.  
  40. else:
  41. cluster_buffer += line
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement