mirror of
https://github.com/HackTricks-wiki/hacktricks-cloud.git
synced 2025-12-12 15:50:19 -08:00
Create clean_for_ai
This commit is contained in:
102
scripts/clean_for_ai
Normal file
102
scripts/clean_for_ai
Normal file
@@ -0,0 +1,102 @@
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
def clean_and_merge_md_files(start_folder, exclude_keywords, output_file):
|
||||
def clean_file_content(file_path):
|
||||
"""Clean the content of a single file and return the cleaned lines."""
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.readlines()
|
||||
|
||||
cleaned_lines = []
|
||||
inside_hint = False
|
||||
for line in content:
|
||||
# Skip lines containing excluded keywords
|
||||
if any(keyword in line for keyword in exclude_keywords):
|
||||
continue
|
||||
|
||||
# Detect and skip {% hint %} ... {% endhint %} blocks
|
||||
if "{% hint style=\"success\" %}" in line:
|
||||
inside_hint = True
|
||||
if "{% endhint %}" in line:
|
||||
inside_hint = False
|
||||
continue
|
||||
if inside_hint:
|
||||
continue
|
||||
|
||||
# Skip lines with <figure> ... </figure>
|
||||
if re.match(r"<figure>.*?</figure>", line):
|
||||
continue
|
||||
|
||||
# Add the line if it passed all checks
|
||||
cleaned_lines.append(line.rstrip())
|
||||
|
||||
# Remove excess consecutive empty lines
|
||||
cleaned_lines = remove_consecutive_empty_lines(cleaned_lines)
|
||||
return cleaned_lines
|
||||
|
||||
def remove_consecutive_empty_lines(lines):
|
||||
"""Allow no more than one consecutive empty line."""
|
||||
cleaned_lines = []
|
||||
previous_line_empty = False
|
||||
for line in lines:
|
||||
if line.strip() == "":
|
||||
if not previous_line_empty:
|
||||
cleaned_lines.append("")
|
||||
previous_line_empty = True
|
||||
else:
|
||||
cleaned_lines.append(line)
|
||||
previous_line_empty = False
|
||||
return cleaned_lines
|
||||
|
||||
def gather_files_in_order(start_folder):
|
||||
"""Gather all .md files in a depth-first order."""
|
||||
files = []
|
||||
for root, _, filenames in os.walk(start_folder):
|
||||
md_files = sorted([os.path.join(root, f) for f in filenames if f.endswith(".md")])
|
||||
files.extend(md_files)
|
||||
return files
|
||||
|
||||
# Gather files in depth-first order
|
||||
all_files = gather_files_in_order(start_folder)
|
||||
|
||||
# Process files and merge into a single output
|
||||
with open(output_file, "w", encoding="utf-8") as output:
|
||||
for file_path in all_files:
|
||||
# Clean the content of the file
|
||||
cleaned_content = clean_file_content(file_path)
|
||||
|
||||
# Skip saving if the cleaned file has fewer than 10 non-empty lines
|
||||
if len([line for line in cleaned_content if line.strip()]) < 10:
|
||||
continue
|
||||
|
||||
# Get the name of the file for the header
|
||||
file_name = os.path.basename(file_path)
|
||||
|
||||
# Write header, cleaned content, and 2 extra new lines
|
||||
output.write(f"# {file_name}\n\n")
|
||||
output.write("\n".join(cleaned_content))
|
||||
output.write("\n\n")
|
||||
|
||||
def main():
|
||||
# Specify the starting folder and output file
|
||||
start_folder = os.getcwd()
|
||||
output_file = os.path.join(tempfile.gettempdir(), "merged_output.md")
|
||||
|
||||
# Keywords to exclude from lines
|
||||
exclude_keywords = [
|
||||
"Keyword1", # Replace with your keywords
|
||||
"Keyword2",
|
||||
"HackTricks", # Example
|
||||
]
|
||||
|
||||
# Clean and merge .md files
|
||||
clean_and_merge_md_files(start_folder, exclude_keywords, output_file)
|
||||
|
||||
# Print the path to the output file
|
||||
print(f"Merged content has been saved to: {output_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Execute this from the hacktricks folder to clean
|
||||
# It will clean all the .md files and compile them into 1 in a proper order
|
||||
main()
|
||||
Reference in New Issue
Block a user