Create clean_for_ai

This commit is contained in:
SirBroccoli
2024-12-16 01:33:26 +01:00
committed by GitHub
parent ed3d72de51
commit 7418e06621

102
scripts/clean_for_ai Normal file
View File

@@ -0,0 +1,102 @@
import os
import re
import tempfile
def clean_and_merge_md_files(start_folder, exclude_keywords, output_file):
def clean_file_content(file_path):
"""Clean the content of a single file and return the cleaned lines."""
with open(file_path, "r", encoding="utf-8") as f:
content = f.readlines()
cleaned_lines = []
inside_hint = False
for line in content:
# Skip lines containing excluded keywords
if any(keyword in line for keyword in exclude_keywords):
continue
# Detect and skip {% hint %} ... {% endhint %} blocks
if "{% hint style=\"success\" %}" in line:
inside_hint = True
if "{% endhint %}" in line:
inside_hint = False
continue
if inside_hint:
continue
# Skip lines with <figure> ... </figure>
if re.match(r"<figure>.*?</figure>", line):
continue
# Add the line if it passed all checks
cleaned_lines.append(line.rstrip())
# Remove excess consecutive empty lines
cleaned_lines = remove_consecutive_empty_lines(cleaned_lines)
return cleaned_lines
def remove_consecutive_empty_lines(lines):
"""Allow no more than one consecutive empty line."""
cleaned_lines = []
previous_line_empty = False
for line in lines:
if line.strip() == "":
if not previous_line_empty:
cleaned_lines.append("")
previous_line_empty = True
else:
cleaned_lines.append(line)
previous_line_empty = False
return cleaned_lines
def gather_files_in_order(start_folder):
"""Gather all .md files in a depth-first order."""
files = []
for root, _, filenames in os.walk(start_folder):
md_files = sorted([os.path.join(root, f) for f in filenames if f.endswith(".md")])
files.extend(md_files)
return files
# Gather files in depth-first order
all_files = gather_files_in_order(start_folder)
# Process files and merge into a single output
with open(output_file, "w", encoding="utf-8") as output:
for file_path in all_files:
# Clean the content of the file
cleaned_content = clean_file_content(file_path)
# Skip saving if the cleaned file has fewer than 10 non-empty lines
if len([line for line in cleaned_content if line.strip()]) < 10:
continue
# Get the name of the file for the header
file_name = os.path.basename(file_path)
# Write header, cleaned content, and 2 extra new lines
output.write(f"# {file_name}\n\n")
output.write("\n".join(cleaned_content))
output.write("\n\n")
def main():
# Specify the starting folder and output file
start_folder = os.getcwd()
output_file = os.path.join(tempfile.gettempdir(), "merged_output.md")
# Keywords to exclude from lines
exclude_keywords = [
"Keyword1", # Replace with your keywords
"Keyword2",
"HackTricks", # Example
]
# Clean and merge .md files
clean_and_merge_md_files(start_folder, exclude_keywords, output_file)
# Print the path to the output file
print(f"Merged content has been saved to: {output_file}")
if __name__ == "__main__":
# Execute this from the hacktricks folder to clean
# It will clean all the .md files and compile them into 1 in a proper order
main()