Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import os
- import sys
- import json
- import pandas as pd
- import tika
- from tika import parser
- from openpyxl import load_workbook
- from openpyxl.utils.dataframe import dataframe_to_rows
- from openpyxl.styles import Alignment, PatternFill
- from openpyxl.worksheet.table import Table, TableStyleInfo
- from openpyxl.formatting.rule import FormulaRule
- # Initialize Tika VM
- tika.initVM()
- def process_pdf(cispdf, outfile):
- # JSON and Excel file names
- cisjson = f"{outfile}.json"
- cisexcel = f"{outfile}.xlsx"
- cistext = 'cis_text.txt'
- # Convert PDF to text
- print(f"[+] Converting '{cispdf}' to text...")
- raw = parser.from_file(cispdf)
- data = raw['content']
- # Create a temporary text file
- print("[+] Creating temp text file...")
- with open(cistext, 'w', encoding='utf-8') as f:
- f.write(data)
- # Remove blank lines and write to temp file
- with open(cistext, 'r', encoding='utf-8') as filer, open('temp.txt', 'w', encoding='utf-8') as filew:
- for line in filer:
- if not line.strip():
- continue
- filew.write(line)
- # Initialize flags and variables
- flagStart, flagDesc, flagAudit, flagRecom, flagComplete = False, False, False, False, False
- cis_title, cis_desc, cis_audit, cis_recom = "", "", "", ""
- listObj = []
- start_processing = False # Flag to indicate when to start processing lines
- # Convert text to JSON format
- print("[+] Converting to JSON...")
- with open("temp.txt", 'r', encoding='utf-8') as filer:
- for line in filer:
- # Check for the start marker "Appendix: Change History"
- if "Appendix: Change History" in line:
- start_processing = True
- continue # Skip the line containing the start marker
- if "CIS Benchmark Recommendation Set" in line:
- break
- # Skip processing until the start marker is found
- if not start_processing:
- continue
- if not line.strip():
- continue
- line = line.strip()
- x = {} # JSON object
- # Check if line matches the CIS title format
- if re.match(r"^[0-9]+\.[0-9]+(\.[0-9]+)* \((L[1-9]|BL)\) .*(?<!\d)$", line):
- # Split the line into parts based on spaces
- parts = line.split(' ', 2) # Split into three parts: section, level, and the rest
- # Extract section, level, and title
- section = parts[0] # The first part is the section
- level = parts[1] if parts[1].startswith('(') else "" # The second part is the level if it starts with '('
- title = parts[2] if len(parts) > 2 else "" # The rest is the title
- # Read and concatenate lines until "Profile Applicability:" is found
- while "Profile Applicability:" not in line:
- next_line = next(filer, '').strip() # Read the next line
- if "Profile Applicability:" in next_line:
- break
- title += " " + next_line
- flagStart, flagDesc, flagAudit, flagRecom, flagComplete = True, False, False, False, False
- if flagStart:
- # Get description - capture everything between 'Description:' and 'Rationale:'
- if "Description:" in line:
- flagDesc = True
- continue # Skip the "Description:" line itself
- if flagDesc and "Rationale:" not in line:
- cis_desc += line
- if "Rationale:" in line:
- flagDesc = False
- # Get Audit - capture everything between 'Audit:' and 'Remediation:'
- if "Audit:" in line:
- flagAudit = True
- continue # Skip the "Audit:" line itself
- if flagAudit and "Remediation:" not in line:
- cis_audit += line
- # Check for Remediation and start capturing
- if "Remediation:" in line:
- flagAudit = False
- flagRecom = True
- # Start formatting Remediation section
- if "Remediation:" not in cis_recom:
- cis_recom += "\n\nRemediation:\n" + line.replace('Remediation:', '').strip()
- continue # Continue processing the next lines for remediation
- # Get Remediation - capture everything between 'Remediation:' and 'References:'
- if flagRecom:
- # Stop capturing if any of these markers are found
- if "References:" in line or "Additional Information:" in line or "CIS Controls:" in line:
- flagRecom = False
- flagComplete = True
- else:
- # Insert specific formatting and handle different sections
- if "Remediation:" in line:
- # Ensure Remediation section is correctly formatted
- if "Remediation:" not in cis_recom:
- cis_recom += "\n\nRemediation:\n" + line.replace('Remediation:', '').strip()
- else:
- cis_recom += " " + line.strip() # Continue appending to Remediation section
- elif "Default Value:" in line:
- cis_recom += "\n\nDefault Value:\n" + line.replace('Default Value:', '').strip()
- elif "Additional Information:" in line:
- cis_recom += "\n\nAdditional Information:\n" + line.replace('Additional Information:', '').strip()
- else:
- # Add content normally, with a space to separate from previous content
- cis_recom += " " + line.strip()
- if flagComplete:
- # Clean up the extracted text
- title = title.replace('\n', ' ')
- cis_desc = cis_desc.replace('\n', ' ').replace('Rationale:', '').replace('| P a g e', '')
- cis_audit = cis_audit.replace('\n', ' ').replace('Remediation:', '').replace('| P a g e', '')
- # Clean up the extracted text before saving
- cis_recom = re.sub(r'\bPage \d+\b', '', cis_recom).strip()
- x['section'] = section
- x['level'] = level
- x['title'] = title
- x['description'] = cis_desc
- x['audit'] = cis_audit
- x['recommendations'] = cis_recom
- # Reset for the next section
- cis_title, cis_desc, cis_audit, cis_recom = "", "", "", ""
- flagStart, flagDesc, flagAudit, flagRecom, flagComplete = False, False, False, False, False
- listObj.append(x)
- print(f"[+] Writing to '{cisjson}' ...")
- with open(cisjson, 'w') as json_file:
- json.dump(listObj, json_file, indent=4, separators=(',', ': '))
- # Create Excel file from JSON data
- print(f"[+] Creating '{cisexcel}' ...")
- df_json = pd.read_json(cisjson)
- # Add the additional columns: Intune Policy and Exception
- df_json.insert(3, 'Intune Policy', '')
- df_json.insert(4, 'Exception', '')
- # Add the 'Exception Reason' column to the end
- df_json['Exception Reason'] = ''
- # Save initial Excel file
- df_json.to_excel(cisexcel, index=False)
- # Load workbook and select the active sheet
- wb = load_workbook(cisexcel)
- ws = wb.active
- # Set column widths and apply word wrap
- column_widths = {
- 'C': 45, # Title column
- 'D': 30, # Intune Policy column width (adjust as needed)
- 'E': 15, # Exception column width (adjust as needed)
- 'F': 74, # Description column
- 'G': 74, # Audit column
- 'H': 74, # Recommendations column
- 'I': 70, # Exception Reason column width (adjust as needed)
- }
- # Apply formatting for each column
- for col in ws.columns:
- column = col[0].column_letter # Get the column letter
- for cell in col:
- # Apply word wrap for relevant columns
- if column in ['A', 'B', 'C', 'D', 'E']: # Columns A to E to vertically middle align
- cell.alignment = Alignment(wrap_text=True, vertical='center')
- if column in ['F', 'G', 'H']: # Columns to word wrap
- cell.alignment = Alignment(wrap_text=True)
- # Set the column width
- if column in column_widths:
- ws.column_dimensions[column].width = column_widths[column]
- # Define table range and create a Table object
- table_range = f"A1:{ws.cell(row=ws.max_row, column=ws.max_column).coordinate}"
- table = Table(displayName="IntunePolicyTable", ref=table_range)
- # Apply table style
- style = TableStyleInfo(
- name="TableStyleLight9", # Blue, Table Style Medium 9
- showFirstColumn=False,
- showLastColumn=False,
- showRowStripes=True,
- showColumnStripes=True
- )
- table.tableStyleInfo = style
- # Add the table to the worksheet
- ws.add_table(table)
- # Define colors for conditional formatting
- green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") # Light green
- red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") # Light red
- # Apply conditional formatting: If there is a value in "Intune Policy" (Column D), fill A-C with green
- ws.conditional_formatting.add(
- 'A2:C{}'.format(ws.max_row),
- FormulaRule(
- formula=['NOT(ISBLANK($D2))'], # Reference to the D column to check for values
- stopIfTrue=True,
- fill=green_fill
- )
- )
- # Apply conditional formatting: If there is a value in "Exception" (Column E), fill A-C with red
- ws.conditional_formatting.add(
- 'A2:C{}'.format(ws.max_row),
- FormulaRule(
- formula=['NOT(ISBLANK($E2))'], # Reference to the E column to check for values
- stopIfTrue=True,
- fill=red_fill
- )
- )
- # Freeze the first three columns
- ws.freeze_panes = 'D1'
- # Save the updated Excel file
- wb.save(cisexcel)
- print("[+] Done!")
- def main():
- if len(sys.argv) < 2:
- print("[!] Please provide the input folder name!")
- print("Usage: python {} <input_folder>\n".format(sys.argv[0]))
- exit()
- input_folder = sys.argv[1]
- # Iterate over all files in the folder
- for filename in os.listdir(input_folder):
- if filename.startswith("CIS_") and filename.endswith(".pdf"):
- cispdf = os.path.join(input_folder, filename)
- outfile = os.path.join(input_folder, os.path.splitext(filename)[0])
- process_pdf(cispdf, outfile)
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement