Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # Configuration options
- case_sensitive = 1 # 1 for true, 0 for false
- # remove line if it contains any of these strings
- remove_contains = ['invalid url', 'UNKNOWN']
- # minimum line length to keep (lines shorter than this will be removed, 0 = no minimum)
- min_line_length = 1
- # maximum line length to keep (lines longer than this will be removed, 0 = no maximum)
- max_line_length = 0 # 0 means no limit
- # enable verbose output
- verbose = 1 # 1 for true, 0 for false
- def remove_duplicate_lines(input_file, output_file=None):
- """
- Removes duplicate lines and lines containing specified strings from a text file
- while preserving the original order.
- Args:
- input_file (str): Path to the input file
- output_file (str, optional): Path to the output file. If None, will create a .clean file.
- Returns:
- tuple: (duplicates_removed, filtered_out, total_kept)
- """
- if output_file is None:
- output_file = input_file + ".clean"
- # Statistics counters
- duplicates_count = 0
- filtered_out = 0
- total_lines = 0
- # Use a set to track seen lines while preserving order
- seen_lines = set()
- unique_lines = []
- try:
- # Read all lines from the file with error handling for encoding issues
- with open(input_file, 'r', encoding='utf-8', errors='replace') as file:
- for line in file:
- total_lines += 1
- # Remove trailing whitespace
- line_content = line.rstrip()
- # Skip lines that are too short (if min_line_length > 0)
- if min_line_length > 0 and len(line_content) < min_line_length:
- filtered_out += 1
- continue
- # Skip lines that are too long (if max_line_length > 0)
- if max_line_length > 0 and len(line_content) > max_line_length:
- filtered_out += 1
- continue
- # Skip lines containing any of the strings in remove_contains
- should_remove = False
- for text in remove_contains:
- if text in line_content:
- should_remove = True
- break
- if should_remove:
- filtered_out += 1
- continue
- # Handle case sensitivity
- if not case_sensitive:
- comparison_key = line_content.lower()
- else:
- comparison_key = line_content
- # Check if we've seen this line before
- if comparison_key not in seen_lines:
- seen_lines.add(comparison_key)
- unique_lines.append(line)
- else:
- duplicates_count += 1
- # Write unique lines to the output file
- with open(output_file, 'w', encoding='utf-8') as file:
- file.writelines(unique_lines)
- return (duplicates_count, filtered_out, len(unique_lines))
- except UnicodeDecodeError:
- # If UTF-8 fails, try with latin-1 which can handle any byte value
- seen_lines.clear()
- unique_lines.clear()
- duplicates_count = 0
- filtered_out = 0
- total_lines = 0
- with open(input_file, 'r', encoding='latin-1', errors='replace') as file:
- for line in file:
- total_lines += 1
- # Remove trailing whitespace
- line_content = line.rstrip()
- # Skip lines that are too short (if min_line_length > 0)
- if min_line_length > 0 and len(line_content) < min_line_length:
- filtered_out += 1
- continue
- # Skip lines that are too long (if max_line_length > 0)
- if max_line_length > 0 and len(line_content) > max_line_length:
- filtered_out += 1
- continue
- # Skip lines containing any of the strings in remove_contains
- should_remove = False
- for text in remove_contains:
- if text in line_content:
- should_remove = True
- break
- if should_remove:
- filtered_out += 1
- continue
- # Handle case sensitivity
- if not case_sensitive:
- comparison_key = line_content.lower()
- else:
- comparison_key = line_content
- # Check if we've seen this line before
- if comparison_key not in seen_lines:
- seen_lines.add(comparison_key)
- unique_lines.append(line)
- else:
- duplicates_count += 1
- # Write unique lines to the output file
- with open(output_file, 'w', encoding='latin-1') as file:
- file.writelines(unique_lines)
- return (duplicates_count, filtered_out, len(unique_lines))
- if __name__ == "__main__":
- import sys
- import os
- if len(sys.argv) < 2:
- print("Usage: python remove_duplicates.py <input_file> [output_file]")
- sys.exit(1)
- input_file = sys.argv[1]
- output_file = sys.argv[2] if len(sys.argv) > 2 else None
- if not os.path.exists(input_file):
- print(f"Error: Input file '{input_file}' not found.")
- sys.exit(1)
- try:
- duplicates, filtered, kept = remove_duplicate_lines(input_file, output_file)
- if verbose:
- print(f"File processing complete:")
- print(f"- Original file: {input_file}")
- print(f"- Output file: {output_file if output_file else input_file + '.clean'}")
- print(f"- Duplicate lines removed: {duplicates}")
- print(f"- Lines filtered by criteria: {filtered}")
- print(f"- Lines kept: {kept}")
- # Print filter settings
- print("\nFilter settings used:")
- print(f"- Case sensitive: {'Yes' if case_sensitive else 'No'}")
- print(f"- Min line length: {min_line_length if min_line_length > 0 else 'No minimum'}")
- print(f"- Max line length: {max_line_length if max_line_length > 0 else 'No maximum'}")
- print(f"- Filtered strings: {', '.join(f'"{s}"' for s in remove_contains) if remove_contains else 'None'}")
- else:
- print(f"Removed {duplicates} duplicates, filtered {filtered} lines, kept {kept} lines.")
- except Exception as e:
- print(f"Error: {e}")
- sys.exit(1)
Add Comment
Please, Sign In to add comment