Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import multiprocessing as mp
def read_lines_in_range(file_path, start_line, end_line, keyword):
    """
    Search a 1-based, inclusive range of lines in a text file for a keyword.

    The file is streamed line by line, so it is never loaded into memory as a
    whole. The search is case-insensitive and stops at the first match.
    A range that starts past the end of the file, or an empty range
    (``end_line < start_line``), simply yields no match.

    :param file_path: Path to the text file (read as UTF-8).
    :param start_line: First line to inspect (1-based, inclusive). Values
        below 1 are treated as 1.
    :param end_line: Last line to inspect (1-based, inclusive).
    :param keyword: The keyword to search for in the file.
    :return: The first matching line with surrounding whitespace stripped,
        or None if nothing matched or the file could not be read.
    """
    from itertools import islice

    # Lines are numbered from 1; clamping keeps a caller that passes 0 from
    # reading one line more than the range it asked for.
    first = max(start_line, 1)
    try:
        needle = keyword.lower()  # lowercase once instead of once per line
        with open(file_path, 'r', encoding='utf-8') as file:
            # islice skips the first ``first - 1`` lines and stops after
            # ``end_line``. Unlike a bare next(), it ends quietly when the
            # file is shorter than the requested range.
            for raw_line in islice(file, first - 1, max(end_line, 0)):
                line = raw_line.strip()
                if needle in line.lower():
                    return line
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except UnicodeDecodeError:
        print("Error: The file encoding is not UTF-8.")
    except Exception as e:
        # Best-effort contract: report the problem and fall through to None.
        print(f"An error occurred: {e}")
    return None
def process_chunk(file_path, start_line, chunk_size, keyword):
    """
    Search one chunk of consecutive lines for the keyword.

    The chunk covers ``chunk_size`` lines beginning at ``start_line``; the
    actual scan is delegated to ``read_lines_in_range``, which stops at the
    first match.

    :param file_path: Path to the text file.
    :param start_line: Line number at which the chunk begins.
    :param chunk_size: How many lines the chunk spans.
    :param keyword: The keyword to search for in the file.
    :return: The matching line, or None when the chunk has no match.
    """
    # Inclusive last line of the chunk.
    last_line = (start_line - 1) + chunk_size
    match = read_lines_in_range(file_path, start_line, last_line, keyword)
    return match
if __name__ == "__main__":
    file_path = 'text.txt'
    # Line numbers are 1-based and inclusive at both ends. Starting at 1
    # (not 0) keeps consecutive chunks from overlapping by one line, which
    # could report the same hit twice.
    start_line = 1
    end_line = 40000000
    chunk_size = 10000  # Keep chunk size for balancing
    keyword = 'something@gmail.com'

    # One argument tuple per chunk. ``end_line + 1`` makes the inclusive upper
    # bound reachable, and the final chunk is shortened so it never runs past
    # ``end_line``.
    chunks = [
        (file_path, first, min(chunk_size, end_line - first + 1), keyword)
        for first in range(start_line, end_line + 1, chunk_size)
    ]

    # NOTE: every task reopens the file and skips forward from the top, so
    # later chunks re-read everything before them.
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.starmap(process_chunk, chunks)

    # process_chunk returns at most one line per chunk (the first match), so
    # this counts chunks containing the keyword, not every matching line.
    hits = [result for result in results if result is not None]
    total_hits = len(hits)
    print(f'Total hits in document: {total_hits}')

    # Print lines where the keyword was found
    for line in hits:
        print(line)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement