Advertisement
johnfikennedy

Untitled

May 25th, 2024
590
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.57 KB | Cybersecurity | 0 0
import multiprocessing as mp
from itertools import islice

  3. def read_lines_in_range(file_path, start_line, end_line, keyword):
  4.     """
  5.    Reads lines from a file within a specified range without loading the entire file into memory.
  6.    Stops when the keyword is found.
  7.    
  8.    :param file_path: Path to the text file.
  9.    :param start_line: Starting line number (inclusive).
  10.    :param end_line: Ending line number (inclusive).
  11.    :param keyword: The keyword to search for in the file.
  12.    :return: The line containing the keyword or None if not found.
  13.    """
  14.     try:
  15.         with open(file_path, 'r', encoding='utf-8') as file:
  16.             # Seek to the start line
  17.             for current_line in range(1, start_line):
  18.                 next(file)
  19.            
  20.             # Read lines within the specified range
  21.             for current_line in range(start_line, end_line + 1):
  22.                 try:
  23.                     line = next(file).strip()
  24.                 except StopIteration:
  25.                     break
  26.                 if keyword.lower() in line.lower():
  27.                     return line
  28.     except FileNotFoundError:
  29.         print(f"Error: The file '{file_path}' was not found.")
  30.     except UnicodeDecodeError:
  31.         print("Error: The file encoding is not UTF-8.")
  32.     except Exception as e:
  33.         print(f"An error occurred: {e}")
  34.     return None
  35.  
  36. def process_chunk(file_path, start_line, chunk_size, keyword):
  37.     """
  38.    Process a chunk of lines to search for the keyword.
  39.    Stops when the keyword is found in the chunk.
  40.    
  41.    :param file_path: Path to the text file.
  42.    :param start_line: Starting line number of the chunk.
  43.    :param chunk_size: Number of lines in the chunk.
  44.    :param keyword: The keyword to search for in the file.
  45.    :return: The line containing the keyword or None if not found.
  46.    """
  47.     end_line = start_line + chunk_size - 1
  48.     return read_lines_in_range(file_path, start_line, end_line, keyword)
  49.  
  50. if __name__ == "__main__":
  51.     file_path = 'text.txt'
  52.     start_line = 0
  53.     end_line = 40000000
  54.     chunk_size = 10000  # Keep chunk size for balancing
  55.     keyword = 'something@gmail.com'
  56.    
  57.     chunks = [(file_path, i, chunk_size, keyword) for i in range(start_line, end_line, chunk_size)]
  58.  
  59.     with mp.Pool(mp.cpu_count()) as pool:
  60.         results = pool.starmap(process_chunk, chunks)
  61.  
  62.     hits = [result for result in results if result is not None]
  63.     total_hits = len(hits)
  64.    
  65.     print(f'Total hits in document: {total_hits}')
  66.    
  67.     # Print lines where the keyword was found
  68.     for line in hits:
  69.         print(line)
  70.  
  71.  
  72.  
  73.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement