# Untitled

"""
Exclude CD-HIT clusters that contain fewer sequences than a given threshold.
"""


import os
import sys

USAGE = "run: python cdhit_filter.py <input_file> <threshold>"


def iter_clusters(lines):
    """Group the lines of a CD-HIT cluster listing into clusters.

    A line starting with ">" opens a new cluster; every following line up to
    the next such line belongs to it. Lines that appear before the first
    header are yielded together as one header-less group.

    Args:
        lines: Iterable of text lines (an open file object works).

    Yields:
        One list of lines per cluster, in input order. Nothing is yielded
        for empty input.
    """
    cluster = []
    for line in lines:
        if line.startswith(">") and cluster:
            yield cluster
            cluster = []
        cluster.append(line)
    # The last cluster has no following header to trigger it, so flush it
    # explicitly; otherwise it would be silently dropped.
    if cluster:
        yield cluster


def cluster_size(cluster):
    """Return the number of sequences listed in one cluster.

    Every non-blank line other than the ">" header counts as one sequence.
    Counting lines rather than occurrences of the substring "at" keeps the
    size correct when a sequence name itself contains "at".

    Args:
        cluster: List of lines as produced by ``iter_clusters``.
    """
    return sum(
        1 for line in cluster if line.strip() and not line.startswith(">")
    )


def filter_clusters(lines, threshold):
    """Yield the text of each cluster holding at least ``threshold`` sequences.

    Args:
        lines: Iterable of text lines of the cluster listing.
        threshold: Minimum number of sequences a cluster needs to be kept.

    Yields:
        The original text (header plus member lines) of each kept cluster.
    """
    for cluster in iter_clusters(lines):
        if cluster_size(cluster) >= threshold:
            yield "".join(cluster)


def filtered_path(original_path):
    """Return the output path: the input file name prefixed with "filtered_".

    The prefix is applied to the file name only, so an input given with a
    directory component is written next to the input instead of to a
    non-existent "filtered_<dir>" directory.
    """
    directory, name = os.path.split(original_path)
    return os.path.join(directory, "filtered_" + name)


def main(args):
    """Run the filter from command-line arguments.

    Args:
        args: Argument vector laid out like ``sys.argv``:
            ``[program, input_file, threshold]``.

    Exits with a usage message when arguments are missing or the threshold
    is not an integer.
    """
    if len(args) < 3:
        sys.exit(USAGE)

    original_file = args[1]
    try:
        threshold = int(args[2])
    except ValueError:
        sys.exit(
            "threshold must be an integer, got {!r}\n{}".format(args[2], USAGE)
        )

    new_file = filtered_path(original_file)
    with open(original_file, 'r') as cdhit, open(new_file, 'w') as filtered:
        filtered.writelines(filter_clusters(cdhit, threshold))


if __name__ == "__main__":
    main(sys.argv)