import os import re import tempfile def clean_and_merge_md_files(start_folder, exclude_keywords, output_file): def clean_file_content(file_path): """Clean the content of a single file and return the cleaned lines.""" with open(file_path, "r", encoding="utf-8") as f: content = f.readlines() cleaned_lines = [] inside_hint = False for i,line in enumerate(content): # Skip lines containing excluded keywords if any(keyword in line for keyword in exclude_keywords): continue # Detect and skip {% hint %} ... {% endhint %} blocks if "{% hint style=\"success\" %}" in line and "Learn & practice" in content[i+1]: inside_hint = True if "{% endhint %}" in line: inside_hint = False continue if inside_hint: continue # Skip lines with
...
if re.match(r"
.*?
", line): continue # Add the line if it passed all checks cleaned_lines.append(line.rstrip()) # Remove excess consecutive empty lines cleaned_lines = remove_consecutive_empty_lines(cleaned_lines) return cleaned_lines def remove_consecutive_empty_lines(lines): """Allow no more than one consecutive empty line.""" cleaned_lines = [] previous_line_empty = False for line in lines: if line.strip() == "": if not previous_line_empty: cleaned_lines.append("") previous_line_empty = True else: cleaned_lines.append(line) previous_line_empty = False return cleaned_lines def gather_files_in_order(start_folder): """Gather all .md files in a depth-first order.""" files = [] for root, _, filenames in os.walk(start_folder): md_files = sorted([os.path.join(root, f) for f in filenames if f.endswith(".md")]) files.extend(md_files) return files # Gather files in depth-first order all_files = gather_files_in_order(start_folder) # Process files and merge into a single output with open(output_file, "w", encoding="utf-8") as output: for file_path in all_files: # Clean the content of the file cleaned_content = clean_file_content(file_path) # Skip saving if the cleaned file has fewer than 10 non-empty lines if len([line for line in cleaned_content if line.strip()]) < 10: continue # Get the name of the file for the header file_name = os.path.basename(file_path) # Write header, cleaned content, and 2 extra new lines output.write(f"# {file_name}\n\n") output.write("\n".join(cleaned_content)) output.write("\n\n") def main(): # Specify the starting folder and output file start_folder = os.getcwd() output_file = os.path.join(tempfile.gettempdir(), "merged_output.md") # Keywords to exclude from lines exclude_keywords = [ "STM Cyber", # STM Cyber ads "offer several valuable cybersecurity services", # STM Cyber ads "and hack the unhackable", # STM Cyber ads "blog.stmcyber.com", # STM Cyber ads "RootedCON", # RootedCON ads "rootedcon.com", # RootedCON ads "the mission of promoting technical knowledge", # RootedCON ads "Intigriti", # Intigriti ads "intigriti.com", # Intigriti ads "Trickest", # Trickest ads "trickest.com", # Trickest ads, "Get Access Today:", "HACKENPROOF", # Hackenproof ads "hackenproof.com", # Hackenproof ads "HackenProof", # Hackenproof ads "discord.com/invite/N3FrSbmwdy", # Hackenproof ads "Hacking Insights:", # Hackenproof ads "Engage with content that delves", # Hackenproof ads "Real-Time Hack News:", # Hackenproof ads "Keep up-to-date with fast-paced", # Hackenproof ads "Latest Announcements:", # Hackenproof ads "Stay informed with the newest bug", # Hackenproof ads "start collaborating with top hackers today!", # Hackenproof ads "discord.com/invite/N3FrSbmwdy", # Hackenproof ads "Pentest-Tools", # Pentest-Tools.com ads "pentest-tools.com", # Pentest-Tools.com ads "perspective on your web apps, network, and", # Pentest-Tools.com ads "report critical, exploitable vulnerabilities with real business impact", # Pentest-Tools.com ads "SerpApi", # SerpApi ads "serpapi.com", # SerpApi ads "offers fast and easy real-time", # SerpApi ads "plans includes access to over 50 different APIs for scraping", # SerpApi ads "8kSec", # 8kSec ads "academy.8ksec.io", # 8kSec ads "Learn the technologies and skills required", # 8kSec ads "WebSec", # WebSec ads "websec.nl", # WebSec ads "which means they do it all; Pentesting", # WebSec ads ] # Clean and merge .md files clean_and_merge_md_files(start_folder, exclude_keywords, output_file) # Print the path to the output file print(f"Merged content has been saved to: {output_file}") if __name__ == "__main__": # Execute this from the hacktricks folder to clean # It will clean all the .md files and compile them into 1 in a proper order main()