Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""
Exclude CD-HIT clusters that contain fewer sequences than a given threshold.

Usage:
    python cdhit_filter.py <input_file> <threshold>

The input is expected to be a CD-HIT cluster listing in which every cluster
starts with a header line beginning with ">" and is followed by one line per
member sequence. Clusters with at least <threshold> members are copied
unchanged to "filtered_<input file name>", created next to the input file.
"""
import os
import sys


def filter_clusters(lines, threshold):
    """Yield the text of each cluster that has at least *threshold* members.

    lines: iterable of text lines (e.g. an open file), newlines included.
    threshold: minimum number of member lines a cluster needs to be kept.

    A cluster is a header line starting with ">" plus every following line up
    to the next header or the end of input. Each non-blank, non-header line
    counts as one member. Lines found before the first header are grouped into
    a header-less cluster and judged by the same rule.
    """
    cluster = []
    members = 0
    for line in lines:
        if line.startswith(">"):
            # A new header closes the previous cluster: decide its fate now.
            if cluster and members >= threshold:
                yield "".join(cluster)
            cluster = [line]
            members = 0
        else:
            cluster.append(line)
            # Count member lines directly rather than searching for the
            # substring "at", which may also occur inside sequence names.
            if line.strip():
                members += 1
    # The final cluster has no following header to trigger the check above.
    if cluster and members >= threshold:
        yield "".join(cluster)


def filtered_path(input_path):
    """Return *input_path* with "filtered_" prefixed to its file name.

    Only the last path component is prefixed, so the output lands in the
    same directory as the input.
    """
    directory, filename = os.path.split(input_path)
    return os.path.join(directory, "filtered_" + filename)


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``.

    Exits with a usage message when arguments are missing and with an
    explanatory message when the threshold is not an integer.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        sys.exit("run: python cdhit_filter.py <input_file> <threshold>")

    original_file = args[1]
    try:
        threshold = int(args[2])
    except ValueError:
        sys.exit("threshold must be an integer, got: " + repr(args[2]))

    new_file = filtered_path(original_file)
    with open(original_file, "r") as cdhit, open(new_file, "w") as filtered:
        for cluster_text in filter_clusters(cdhit, threshold):
            filtered.write(cluster_text)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement