Remove Duplicate Lines

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# ---------------------------------------------------------------------------
# Settings (edit these values to change how the script behaves)
# ---------------------------------------------------------------------------

# Report style: 1 prints the full multi-line report, 0 prints a one-line summary.
verbose = 1

# Duplicate matching: 1 compares lines exactly, 0 ignores letter case.
case_sensitive = 1

# Length limits, applied to each line after trailing whitespace is stripped.
# Lines shorter than min_line_length are dropped (0 disables the minimum).
min_line_length = 1
# Lines longer than max_line_length are dropped (0 disables the maximum).
max_line_length = 0

# A line is dropped when it contains any of these substrings.
remove_contains = [
    'invalid url',
    'UNKNOWN',
]

def _dedupe_with_encoding(input_file, output_file, encoding):
    """
    Filter and de-duplicate input_file into output_file using one text encoding.

    The input is read completely before the output is opened, so a decoding
    failure never leaves a partially written output file behind.

    Args:
        input_file (str): Path to the input file
        output_file (str): Path to the output file
        encoding (str): Codec used for both reading and writing

    Returns:
        tuple: (duplicates_removed, filtered_out, total_kept)

    Raises:
        UnicodeDecodeError: If the input cannot be decoded with `encoding`.
    """
    duplicates_count = 0
    filtered_out = 0

    # The set gives O(1) "seen before?" checks; the list keeps first-seen order.
    seen_lines = set()
    unique_lines = []

    # Strict decoding (no errors='replace'): undecodable bytes must raise so
    # the caller can retry with another codec instead of corrupting the data.
    with open(input_file, 'r', encoding=encoding) as file:
        for line in file:
            # Filters and duplicate detection ignore trailing whitespace,
            # but the line is written back exactly as it was read.
            line_content = line.rstrip()
            length = len(line_content)

            # Length filters; a limit of 0 disables the corresponding check.
            if min_line_length > 0 and length < min_line_length:
                filtered_out += 1
                continue
            if max_line_length > 0 and length > max_line_length:
                filtered_out += 1
                continue

            # Substring filter (always case-sensitive).
            if any(text in line_content for text in remove_contains):
                filtered_out += 1
                continue

            comparison_key = line_content if case_sensitive else line_content.lower()

            if comparison_key in seen_lines:
                duplicates_count += 1
            else:
                seen_lines.add(comparison_key)
                unique_lines.append(line)

    with open(output_file, 'w', encoding=encoding) as file:
        file.writelines(unique_lines)

    return (duplicates_count, filtered_out, len(unique_lines))


def remove_duplicate_lines(input_file, output_file=None):
    """
    Removes duplicate lines and lines containing specified strings from a text file
    while preserving the original order.

    Filtering is controlled by the module-level settings min_line_length,
    max_line_length, remove_contains and case_sensitive. The first occurrence
    of each line is kept.

    The file is decoded as UTF-8 first. If it is not valid UTF-8, it is
    re-read as latin-1 (which maps every byte to a character) and the output
    is written as latin-1 too, so the kept lines keep their original bytes.

    Args:
        input_file (str): Path to the input file
        output_file (str, optional): Path to the output file. If None, will create a .clean file.

    Returns:
        tuple: (duplicates_removed, filtered_out, total_kept)
    """
    if output_file is None:
        output_file = input_file + ".clean"

    try:
        return _dedupe_with_encoding(input_file, output_file, 'utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: start over with latin-1, which cannot fail to decode.
        return _dedupe_with_encoding(input_file, output_file, 'latin-1')

# Command-line entry point: remove_duplicates.py <input_file> [output_file]
# Exits with status 1 on a usage error, a missing input file, or any
# processing failure.
if __name__ == "__main__":
    import sys
    import os

    if len(sys.argv) < 2:
        print("Usage: python remove_duplicates.py <input_file> [output_file]")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else None

    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found.")
        sys.exit(1)

    try:
        duplicates, filtered, kept = remove_duplicate_lines(input_file, output_file)

        if verbose:
            print("File processing complete:")
            print(f"- Original file: {input_file}")
            print(f"- Output file: {output_file if output_file else input_file + '.clean'}")
            print(f"- Duplicate lines removed: {duplicates}")
            print(f"- Lines filtered by criteria: {filtered}")
            print(f"- Lines kept: {kept}")

            # Print filter settings
            print("\nFilter settings used:")
            print(f"- Case sensitive: {'Yes' if case_sensitive else 'No'}")
            print(f"- Min line length: {min_line_length if min_line_length > 0 else 'No minimum'}")
            print(f"- Max line length: {max_line_length if max_line_length > 0 else 'No maximum'}")

            # Build the quoted list outside the f-string below: reusing the
            # enclosing quote character inside a replacement field is a
            # SyntaxError on Python versions before 3.12.
            if remove_contains:
                filtered_strings = ', '.join(f'"{s}"' for s in remove_contains)
            else:
                filtered_strings = 'None'
            print(f"- Filtered strings: {filtered_strings}")
        else:
            print(f"Removed {duplicates} duplicates, filtered {filtered} lines, kept {kept} lines.")

    except Exception as e:
        # Top-level boundary: report any failure and signal it via the exit code.
        print(f"Error: {e}")
        sys.exit(1)