ToKeiChun

Remove Duplicate Lines

May 1st, 2025
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.96 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3.  
  4. # Configuration options
  5. case_sensitive = 1  # 1 for true, 0 for false
  6. # remove line if it contains any of these strings
  7. remove_contains = ['invalid url', 'UNKNOWN']
  8. # minimum line length to keep (lines shorter than this will be removed, 0 = no minimum)
  9. min_line_length = 1
  10. # maximum line length to keep (lines longer than this will be removed, 0 = no maximum)
  11. max_line_length = 0  # 0 means no limit
  12. # enable verbose output
  13. verbose = 1  # 1 for true, 0 for false
  14.  
  15. def remove_duplicate_lines(input_file, output_file=None):
  16.     """
  17.    Removes duplicate lines and lines containing specified strings from a text file
  18.    while preserving the original order.
  19.    
  20.    Args:
  21.        input_file (str): Path to the input file
  22.        output_file (str, optional): Path to the output file. If None, will create a .clean file.
  23.    
  24.    Returns:
  25.        tuple: (duplicates_removed, filtered_out, total_kept)
  26.    """
  27.     if output_file is None:
  28.         output_file = input_file + ".clean"
  29.    
  30.     # Statistics counters
  31.     duplicates_count = 0
  32.     filtered_out = 0
  33.     total_lines = 0
  34.    
  35.     # Use a set to track seen lines while preserving order
  36.     seen_lines = set()
  37.     unique_lines = []
  38.    
  39.     try:
  40.         # Read all lines from the file with error handling for encoding issues
  41.         with open(input_file, 'r', encoding='utf-8', errors='replace') as file:
  42.             for line in file:
  43.                 total_lines += 1
  44.                
  45.                 # Remove trailing whitespace
  46.                 line_content = line.rstrip()
  47.                
  48.                 # Skip lines that are too short (if min_line_length > 0)
  49.                 if min_line_length > 0 and len(line_content) < min_line_length:
  50.                     filtered_out += 1
  51.                     continue
  52.                
  53.                 # Skip lines that are too long (if max_line_length > 0)
  54.                 if max_line_length > 0 and len(line_content) > max_line_length:
  55.                     filtered_out += 1
  56.                     continue
  57.                
  58.                 # Skip lines containing any of the strings in remove_contains
  59.                 should_remove = False
  60.                 for text in remove_contains:
  61.                     if text in line_content:
  62.                         should_remove = True
  63.                         break
  64.                
  65.                 if should_remove:
  66.                     filtered_out += 1
  67.                     continue
  68.                
  69.                 # Handle case sensitivity
  70.                 if not case_sensitive:
  71.                     comparison_key = line_content.lower()
  72.                 else:
  73.                     comparison_key = line_content
  74.                
  75.                 # Check if we've seen this line before
  76.                 if comparison_key not in seen_lines:
  77.                     seen_lines.add(comparison_key)
  78.                     unique_lines.append(line)
  79.                 else:
  80.                     duplicates_count += 1
  81.        
  82.         # Write unique lines to the output file
  83.         with open(output_file, 'w', encoding='utf-8') as file:
  84.             file.writelines(unique_lines)
  85.        
  86.         return (duplicates_count, filtered_out, len(unique_lines))
  87.    
  88.     except UnicodeDecodeError:
  89.         # If UTF-8 fails, try with latin-1 which can handle any byte value
  90.         seen_lines.clear()
  91.         unique_lines.clear()
  92.         duplicates_count = 0
  93.         filtered_out = 0
  94.         total_lines = 0
  95.        
  96.         with open(input_file, 'r', encoding='latin-1', errors='replace') as file:
  97.             for line in file:
  98.                 total_lines += 1
  99.                
  100.                 # Remove trailing whitespace
  101.                 line_content = line.rstrip()
  102.                
  103.                 # Skip lines that are too short (if min_line_length > 0)
  104.                 if min_line_length > 0 and len(line_content) < min_line_length:
  105.                     filtered_out += 1
  106.                     continue
  107.                
  108.                 # Skip lines that are too long (if max_line_length > 0)
  109.                 if max_line_length > 0 and len(line_content) > max_line_length:
  110.                     filtered_out += 1
  111.                     continue
  112.                
  113.                 # Skip lines containing any of the strings in remove_contains
  114.                 should_remove = False
  115.                 for text in remove_contains:
  116.                     if text in line_content:
  117.                         should_remove = True
  118.                         break
  119.                
  120.                 if should_remove:
  121.                     filtered_out += 1
  122.                     continue
  123.                
  124.                 # Handle case sensitivity
  125.                 if not case_sensitive:
  126.                     comparison_key = line_content.lower()
  127.                 else:
  128.                     comparison_key = line_content
  129.                
  130.                 # Check if we've seen this line before
  131.                 if comparison_key not in seen_lines:
  132.                     seen_lines.add(comparison_key)
  133.                     unique_lines.append(line)
  134.                 else:
  135.                     duplicates_count += 1
  136.        
  137.         # Write unique lines to the output file
  138.         with open(output_file, 'w', encoding='latin-1') as file:
  139.             file.writelines(unique_lines)
  140.        
  141.         return (duplicates_count, filtered_out, len(unique_lines))
  142.  
  143. if __name__ == "__main__":
  144.     import sys
  145.     import os
  146.    
  147.     if len(sys.argv) < 2:
  148.         print("Usage: python remove_duplicates.py <input_file> [output_file]")
  149.         sys.exit(1)
  150.    
  151.     input_file = sys.argv[1]
  152.     output_file = sys.argv[2] if len(sys.argv) > 2 else None
  153.    
  154.     if not os.path.exists(input_file):
  155.         print(f"Error: Input file '{input_file}' not found.")
  156.         sys.exit(1)
  157.    
  158.     try:
  159.         duplicates, filtered, kept = remove_duplicate_lines(input_file, output_file)
  160.        
  161.         if verbose:
  162.             print(f"File processing complete:")
  163.             print(f"- Original file: {input_file}")
  164.             print(f"- Output file: {output_file if output_file else input_file + '.clean'}")
  165.             print(f"- Duplicate lines removed: {duplicates}")
  166.             print(f"- Lines filtered by criteria: {filtered}")
  167.             print(f"- Lines kept: {kept}")
  168.            
  169.             # Print filter settings
  170.             print("\nFilter settings used:")
  171.             print(f"- Case sensitive: {'Yes' if case_sensitive else 'No'}")
  172.             print(f"- Min line length: {min_line_length if min_line_length > 0 else 'No minimum'}")
  173.             print(f"- Max line length: {max_line_length if max_line_length > 0 else 'No maximum'}")
  174.             print(f"- Filtered strings: {', '.join(f'"{s}"' for s in remove_contains) if remove_contains else 'None'}")
  175.         else:
  176.             print(f"Removed {duplicates} duplicates, filtered {filtered} lines, kept {kept} lines.")
  177.    
  178.     except Exception as e:
  179.         print(f"Error: {e}")
  180.         sys.exit(1)
Add Comment
Please, Sign In to add comment