From 4bbfe221f23a8b1ebf193578eb88e5b21f19c712 Mon Sep 17 00:00:00 2001 From: Benexl Date: Thu, 24 Jul 2025 23:36:22 +0300 Subject: [PATCH] feat: refactor provider imports and enhance HTML parsing utilities --- fa | 2 +- .../provider/anime/animepahe/extractor.py | 11 +- .../libs/provider/anime/animepahe/provider.py | 13 +- fastanime/libs/provider/anime/utils/debug.py | 2 +- fastanime/libs/provider/scraping/__init__.py | 0 .../libs/provider/scraping/html_parser.py | 474 ++++++++++++++++++ .../libs/provider/scraping/user_agents.py | 238 +++++++++ fastanime/libs/provider/scraping/utils.py | 264 ++++++++++ 8 files changed, 992 insertions(+), 12 deletions(-) create mode 100644 fastanime/libs/provider/scraping/__init__.py create mode 100644 fastanime/libs/provider/scraping/html_parser.py create mode 100644 fastanime/libs/provider/scraping/user_agents.py create mode 100644 fastanime/libs/provider/scraping/utils.py diff --git a/fa b/fa index 1fac532..aa076fb 100755 --- a/fa +++ b/fa @@ -3,4 +3,4 @@ provider_type=$1 provider_name=$2 [ -z "$provider_type" ] && echo "Please specify provider type" && exit [ -z "$provider_name" ] && echo "Please specify provider type" && exit -uv run python -m fastanime.libs.providers.${provider_type}.${provider_name}.provider +uv run python -m fastanime.libs.provider.${provider_type}.${provider_name}.provider diff --git a/fastanime/libs/provider/anime/animepahe/extractor.py b/fastanime/libs/provider/anime/animepahe/extractor.py index 6b82251..349aa74 100644 --- a/fastanime/libs/provider/anime/animepahe/extractor.py +++ b/fastanime/libs/provider/anime/animepahe/extractor.py @@ -2,7 +2,7 @@ import re def animepahe_key_creator(c: int, a: int): - from yt_dlp.utils import encode_base_n + from ...scraping.utils import encode_base_n if c < a: val_a = "" @@ -37,17 +37,18 @@ ENCODE_JS_REGEX = re.compile(r"'(.*?);',(\d+),(\d+),'(.*)'\.split") def process_animepahe_embed_page(embed_page: str): - from yt_dlp.utils import get_element_text_and_html_by_tag + from ...scraping.html_parser import get_element_text_and_html_by_tag encoded_js_string = "" embed_page_content = embed_page for _ in range(8): text, html = get_element_text_and_html_by_tag("script", embed_page_content) - if not text: + if not text and html: embed_page_content = re.sub(html, "", embed_page_content) continue - encoded_js_string = text.strip() - break + if text: + encoded_js_string = text.strip() + break if not encoded_js_string: return obsfucated_js_parameter_match = PARAMETERS_REGEX.search(encoded_js_string) diff --git a/fastanime/libs/provider/anime/animepahe/provider.py b/fastanime/libs/provider/anime/animepahe/provider.py index 2d3b80e..88da63b 100644 --- a/fastanime/libs/provider/anime/animepahe/provider.py +++ b/fastanime/libs/provider/anime/animepahe/provider.py @@ -106,8 +106,7 @@ class AnimePahe(BaseAnimeProvider): @debug_provider def episode_streams(self, params: EpisodeStreamsParams) -> Iterator[Server] | None: - # TODO: replace with custom implementations using default html parser or lxml - from yt_dlp.utils import ( + from ...scraping.html_parser import ( extract_attributes, get_element_by_id, get_elements_html_by_class, @@ -125,6 +124,9 @@ class AnimePahe(BaseAnimeProvider): response.raise_for_status() c = get_element_by_id("resolutionMenu", response.text) + if not c: + logger.error("Resolution menu not found in the response") + return resolutionMenuItems = get_elements_html_by_class("dropdown-item", c) res_dicts = [extract_attributes(item) for item in resolutionMenuItems] quality = None @@ -133,8 
+135,9 @@ class AnimePahe(BaseAnimeProvider): # TODO: better document the scraping process for res_dict in res_dicts: - embed_url = res_dict["data-src"] - data_audio = "dub" if res_dict["data-audio"] == "eng" else "sub" + # the actual attributes are data attributes in the original html 'prefixed with data-' + embed_url = res_dict["src"] + data_audio = "dub" if res_dict["audio"] == "eng" else "sub" if data_audio != params.translation_type: continue @@ -162,7 +165,7 @@ class AnimePahe(BaseAnimeProvider): logger.error("failed to find juicy stream") continue juicy_stream = juicy_stream.group(1) - quality = res_dict["data-resolution"] + quality = res_dict["resolution"] translation_type = data_audio stream_link = juicy_stream diff --git a/fastanime/libs/provider/anime/utils/debug.py b/fastanime/libs/provider/anime/utils/debug.py index 332b985..a8d00b8 100644 --- a/fastanime/libs/provider/anime/utils/debug.py +++ b/fastanime/libs/provider/anime/utils/debug.py @@ -36,7 +36,7 @@ def test_anime_provider(AnimeProvider: Type[BaseAnimeProvider]): anime_provider = AnimeProvider( Client(headers={"User-Agent": random_user_agent(), **AnimeProvider.HEADERS}) ) - print(APP_ASCII_ART) + print(APP_ASCII_ART.read_text(encoding="utf-8")) query = input("What anime would you like to stream: ") search_results = anime_provider.search(SearchParams(query=query)) if not search_results: diff --git a/fastanime/libs/provider/scraping/__init__.py b/fastanime/libs/provider/scraping/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fastanime/libs/provider/scraping/html_parser.py b/fastanime/libs/provider/scraping/html_parser.py new file mode 100644 index 0000000..c6f9434 --- /dev/null +++ b/fastanime/libs/provider/scraping/html_parser.py @@ -0,0 +1,474 @@ +""" +HTML parsing utilities with optional lxml support. + +This module provides comprehensive HTML parsing capabilities using either +Python's built-in html.parser or lxml for better performance when available. +""" +# TODO: Review and optimize the HTML parsing logic for better performance and flexibility. +# Consider adding more utility functions for common HTML manipulation tasks. +import logging +import re +from html.parser import HTMLParser as BaseHTMLParser +from typing import Dict, List, Optional, Tuple, Union + +logger = logging.getLogger(__name__) + +# Try to import lxml +HAS_LXML = False +try: + from lxml import etree, html as lxml_html + HAS_LXML = True + logger.debug("lxml is available and will be used for HTML parsing") +except ImportError: + logger.debug("lxml not available, falling back to html.parser") + + +class HTMLParserConfig: + """Configuration for HTML parser selection.""" + + def __init__(self, use_lxml: Optional[bool] = None): + """ + Initialize parser configuration. + + Args: + use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None) + """ + if use_lxml is None: + self.use_lxml = HAS_LXML + else: + self.use_lxml = use_lxml and HAS_LXML + + if use_lxml and not HAS_LXML: + logger.warning("lxml requested but not available, falling back to html.parser") + + +class HTMLParser: + """ + Comprehensive HTML parser with optional lxml support. + + Provides a unified interface for HTML parsing operations regardless + of the underlying parser implementation. 
+ """ + + def __init__(self, config: Optional[HTMLParserConfig] = None): + """Initialize the HTML parser with configuration.""" + self.config = config or HTMLParserConfig() + + def parse(self, html_content: str) -> Union[etree._Element, 'ParsedHTML']: + """ + Parse HTML content and return a parsed tree. + + Args: + html_content: Raw HTML string to parse + + Returns: + Parsed HTML tree (lxml Element or custom ParsedHTML object) + """ + if self.config.use_lxml: + return self._parse_with_lxml(html_content) + else: + return self._parse_with_builtin(html_content) + + def _parse_with_lxml(self, html_content: str) -> etree._Element: + """Parse HTML using lxml.""" + try: + # Use lxml's HTML parser which is more lenient + return lxml_html.fromstring(html_content) + except Exception as e: + logger.warning(f"lxml parsing failed: {e}, falling back to html.parser") + return self._parse_with_builtin(html_content) + + def _parse_with_builtin(self, html_content: str) -> 'ParsedHTML': + """Parse HTML using Python's built-in parser.""" + parser = BuiltinHTMLParser() + parser.feed(html_content) + return ParsedHTML(parser.elements, html_content) + + +class BuiltinHTMLParser(BaseHTMLParser): + """Enhanced HTML parser using Python's built-in capabilities.""" + + def __init__(self): + super().__init__() + self.elements = [] + self.current_element = None + self.element_stack = [] + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): + """Handle opening tags.""" + element = { + 'tag': tag, + 'attrs': dict(attrs), + 'text': '', + 'children': [], + 'start_pos': self.getpos(), + } + + if self.element_stack: + self.element_stack[-1]['children'].append(element) + else: + self.elements.append(element) + + self.element_stack.append(element) + + def handle_endtag(self, tag: str): + """Handle closing tags.""" + if self.element_stack and self.element_stack[-1]['tag'] == tag: + element = self.element_stack.pop() + element['end_pos'] = self.getpos() + + def handle_data(self, data: str): + """Handle text content.""" + if self.element_stack: + self.element_stack[-1]['text'] += data + + +class ParsedHTML: + """Wrapper for parsed HTML using built-in parser.""" + + def __init__(self, elements: List[Dict], raw_html: str): + self.elements = elements + self.raw_html = raw_html + + def find_by_id(self, element_id: str) -> Optional[Dict]: + """Find element by ID.""" + return self._find_recursive(self.elements, lambda el: el['attrs'].get('id') == element_id) + + def find_by_class(self, class_name: str) -> List[Dict]: + """Find elements by class name.""" + results = [] + self._find_all_recursive( + self.elements, + lambda el: class_name in el['attrs'].get('class', '').split(), + results + ) + return results + + def find_by_tag(self, tag_name: str) -> List[Dict]: + """Find elements by tag name.""" + results = [] + self._find_all_recursive( + self.elements, + lambda el: el['tag'].lower() == tag_name.lower(), + results + ) + return results + + def _find_recursive(self, elements: List[Dict], condition) -> Optional[Dict]: + """Recursively find first element matching condition.""" + for element in elements: + if condition(element): + return element + result = self._find_recursive(element['children'], condition) + if result: + return result + return None + + def _find_all_recursive(self, elements: List[Dict], condition, results: List[Dict]): + """Recursively find all elements matching condition.""" + for element in elements: + if condition(element): + results.append(element) + 
self._find_all_recursive(element['children'], condition, results) + + +# Global parser instance +_default_parser = HTMLParser() + + +def extract_attributes(html_element: str) -> Dict[str, str]: + """ + Extract attributes from an HTML element string. + + Args: + html_element: HTML element as string (e.g., '
<div class="test" id="main">') + + Returns: + Dictionary of attribute name-value pairs + + Examples: + >>> extract_attributes('<div class="test" id="main">
') + {'class': 'test', 'id': 'main'} + """ + if not html_element: + return {} + + # Use regex to extract attributes from HTML string + attr_pattern = r'(\w+)=(["\'])([^"\']*?)\2' + matches = re.findall(attr_pattern, html_element) + + attributes = {} + for match in matches: + attr_name, _, attr_value = match + attributes[attr_name] = attr_value + + # Handle attributes without quotes + unquoted_pattern = r'(\w+)=([^\s>]+)' + unquoted_matches = re.findall(unquoted_pattern, html_element) + for attr_name, attr_value in unquoted_matches: + if attr_name not in attributes: + attributes[attr_name] = attr_value + + return attributes + + +def get_element_by_id(element_id: str, html_content: str) -> Optional[str]: + """ + Get HTML element by ID. + + Args: + element_id: The ID attribute value to search for + html_content: HTML content to search in + + Returns: + HTML string of the element or None if not found + + Examples: + >>> html = '
<div id="test">Content</div>' + >>> get_element_by_id("test", html) + '<div id="test">Content</div>
' + """ + parsed = _default_parser.parse(html_content) + + if _default_parser.config.use_lxml: + try: + element = parsed.xpath(f'//*[@id="{element_id}"]') + if element: + return etree.tostring(element[0], encoding='unicode', method='html') + except Exception as e: + logger.warning(f"lxml XPath search failed: {e}") + return None + else: + element = parsed.find_by_id(element_id) + if element: + return _element_to_html(element, html_content) + + return None + + +def get_element_by_tag(tag_name: str, html_content: str) -> Optional[str]: + """ + Get first HTML element by tag name. + + Args: + tag_name: The tag name to search for + html_content: HTML content to search in + + Returns: + HTML string of the element or None if not found + """ + parsed = _default_parser.parse(html_content) + + if _default_parser.config.use_lxml: + try: + elements = parsed.xpath(f'//{tag_name}') + if elements: + return etree.tostring(elements[0], encoding='unicode', method='html') + except Exception as e: + logger.warning(f"lxml XPath search failed: {e}") + return None + else: + elements = parsed.find_by_tag(tag_name) + if elements: + return _element_to_html(elements[0], html_content) + + return None + + +def get_element_by_class(class_name: str, html_content: str) -> Optional[str]: + """ + Get first HTML element by class name. + + Args: + class_name: The class name to search for + html_content: HTML content to search in + + Returns: + HTML string of the element or None if not found + """ + parsed = _default_parser.parse(html_content) + + if _default_parser.config.use_lxml: + try: + elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]') + if elements: + return etree.tostring(elements[0], encoding='unicode', method='html') + except Exception as e: + logger.warning(f"lxml XPath search failed: {e}") + return None + else: + elements = parsed.find_by_class(class_name) + if elements: + return _element_to_html(elements[0], html_content) + + return None + + +def get_elements_by_tag(tag_name: str, html_content: str) -> List[str]: + """ + Get all HTML elements by tag name. + + Args: + tag_name: The tag name to search for + html_content: HTML content to search in + + Returns: + List of HTML strings for matching elements + """ + parsed = _default_parser.parse(html_content) + results = [] + + if _default_parser.config.use_lxml: + try: + elements = parsed.xpath(f'//{tag_name}') + for element in elements: + results.append(etree.tostring(element, encoding='unicode', method='html')) + except Exception as e: + logger.warning(f"lxml XPath search failed: {e}") + else: + elements = parsed.find_by_tag(tag_name) + for element in elements: + results.append(_element_to_html(element, html_content)) + + return results + + +def get_elements_by_class(class_name: str, html_content: str) -> List[str]: + """ + Get all HTML elements by class name. 
+ + Args: + class_name: The class name to search for + html_content: HTML content to search in + + Returns: + List of HTML strings for matching elements + """ + parsed = _default_parser.parse(html_content) + results = [] + + if _default_parser.config.use_lxml: + try: + elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]') + for element in elements: + results.append(etree.tostring(element, encoding='unicode', method='html')) + except Exception as e: + logger.warning(f"lxml XPath search failed: {e}") + else: + elements = parsed.find_by_class(class_name) + for element in elements: + results.append(_element_to_html(element, html_content)) + + return results + + +def get_elements_html_by_class(class_name: str, html_content: str) -> List[str]: + """ + Get HTML strings of elements by class name. + + This is an alias for get_elements_by_class for yt-dlp compatibility. + + Args: + class_name: The class name to search for + html_content: HTML content to search in + + Returns: + List of HTML strings for matching elements + """ + return get_elements_by_class(class_name, html_content) + + +def get_element_text_and_html_by_tag(tag_name: str, html_content: str) -> Tuple[Optional[str], Optional[str]]: + """ + Get both text content and HTML of first element by tag name. + + Args: + tag_name: The tag name to search for + html_content: HTML content to search in + + Returns: + Tuple of (text_content, html_string) or (None, None) if not found + + Examples: + >>> html = '<script>alert("test");</script>' + >>> get_element_text_and_html_by_tag("script", html) + ('alert("test");', '<script>alert("test");</script>') + """ + parsed = _default_parser.parse(html_content) + + if _default_parser.config.use_lxml: + try: + elements = parsed.xpath(f'//{tag_name}') + if elements: + element = elements[0] + text = element.text_content() if hasattr(element, 'text_content') else (element.text or '') + html_str = etree.tostring(element, encoding='unicode', method='html') + return text, html_str + except Exception as e: + logger.warning(f"lxml XPath search failed: {e}") + return None, None + else: + elements = parsed.find_by_tag(tag_name) + if elements: + element = elements[0] + text = _extract_text_content(element) + html_str = _element_to_html(element, html_content) + return text, html_str + + return None, None + + +def _element_to_html(element: Dict, original_html: str) -> str: + """ + Convert parsed element back to HTML string. + + This is a simplified implementation that reconstructs HTML from parsed data. + For production use, consider using lxml for better accuracy. + """ + if not element: + return "" + + # Build opening tag + tag = element['tag'] + attrs = element.get('attrs', {}) + attr_str = ' '.join(f'{k}="{v}"' for k, v in attrs.items() if v is not None) + + if attr_str: + opening_tag = f"<{tag} {attr_str}>" + else: + opening_tag = f"<{tag}>" + + # Add text content + text = element.get('text', '') + + # Add children + children_html = "" + for child in element.get('children', []): + children_html += _element_to_html(child, original_html) + + # Build closing tag + closing_tag = f"</{tag}>" + + return f"{opening_tag}{text}{children_html}{closing_tag}" + + +def _extract_text_content(element: Dict) -> str: + """Extract all text content from element and its children.""" + text = element.get('text', '') + + for child in element.get('children', []): + text += _extract_text_content(child) + + return text + + +def configure_parser(use_lxml: Optional[bool] = None) -> None: + """ + Configure the global HTML parser. 
+ + Args: + use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None) + """ + global _default_parser + _default_parser = HTMLParser(HTMLParserConfig(use_lxml)) + logger.info(f"HTML parser configured: {'lxml' if _default_parser.config.use_lxml else 'html.parser'}") diff --git a/fastanime/libs/provider/scraping/user_agents.py b/fastanime/libs/provider/scraping/user_agents.py new file mode 100644 index 0000000..c02592c --- /dev/null +++ b/fastanime/libs/provider/scraping/user_agents.py @@ -0,0 +1,238 @@ +""" +User agent utilities for web scraping. + +Provides functionality to generate random user agent strings +to avoid detection and blocking by websites. +""" + +import random +from typing import List, Optional + + +class UserAgentGenerator: + """ + Generator for realistic user agent strings. + + Provides a variety of common user agents from different browsers + and operating systems to help avoid detection. + """ + + # Common user agents for different browsers and OS combinations + USER_AGENTS = [ + # Chrome on Windows + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + + # Chrome on macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", + + # Chrome on Linux + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", + + # Firefox on Windows + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0", + + # Firefox on macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0", + + # Firefox on Linux + "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0", + + # Safari on macOS + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15", + + # Edge on Windows + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0", + + # Mobile Chrome (Android) + "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36", + "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36", + + # Mobile Safari (iOS) + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (iPad; CPU OS 17_3 like Mac OS X) 
AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1", + ] + + # Browser-specific user agents for when you need a specific browser + CHROME_USER_AGENTS = [ua for ua in USER_AGENTS if "Chrome" in ua and "Edg" not in ua] + FIREFOX_USER_AGENTS = [ua for ua in USER_AGENTS if "Firefox" in ua] + SAFARI_USER_AGENTS = [ua for ua in USER_AGENTS if "Safari" in ua and "Chrome" not in ua] + EDGE_USER_AGENTS = [ua for ua in USER_AGENTS if "Edg" in ua] + + # Platform-specific user agents + WINDOWS_USER_AGENTS = [ua for ua in USER_AGENTS if "Windows NT" in ua] + MACOS_USER_AGENTS = [ua for ua in USER_AGENTS if "Macintosh" in ua] + LINUX_USER_AGENTS = [ua for ua in USER_AGENTS if "Linux" in ua and "Android" not in ua] + MOBILE_USER_AGENTS = [ua for ua in USER_AGENTS if "Mobile" in ua or "Android" in ua] + + def __init__(self, seed: Optional[int] = None): + """ + Initialize the user agent generator. + + Args: + seed: Random seed for reproducible results (optional) + """ + if seed is not None: + random.seed(seed) + + def random(self) -> str: + """ + Get a random user agent string. + + Returns: + Random user agent string + """ + return random.choice(self.USER_AGENTS) + + def random_browser(self, browser: str) -> str: + """ + Get a random user agent for a specific browser. + + Args: + browser: Browser name ('chrome', 'firefox', 'safari', 'edge') + + Returns: + Random user agent string for the specified browser + + Raises: + ValueError: If browser is not supported + """ + browser = browser.lower() + if browser == 'chrome': + return random.choice(self.CHROME_USER_AGENTS) + elif browser == 'firefox': + return random.choice(self.FIREFOX_USER_AGENTS) + elif browser == 'safari': + return random.choice(self.SAFARI_USER_AGENTS) + elif browser == 'edge': + return random.choice(self.EDGE_USER_AGENTS) + else: + raise ValueError(f"Unsupported browser: {browser}") + + def random_platform(self, platform: str) -> str: + """ + Get a random user agent for a specific platform. + + Args: + platform: Platform name ('windows', 'macos', 'linux', 'mobile') + + Returns: + Random user agent string for the specified platform + + Raises: + ValueError: If platform is not supported + """ + platform = platform.lower() + if platform == 'windows': + return random.choice(self.WINDOWS_USER_AGENTS) + elif platform in ('macos', 'mac'): + return random.choice(self.MACOS_USER_AGENTS) + elif platform == 'linux': + return random.choice(self.LINUX_USER_AGENTS) + elif platform == 'mobile': + return random.choice(self.MOBILE_USER_AGENTS) + else: + raise ValueError(f"Unsupported platform: {platform}") + + def add_user_agent(self, user_agent: str) -> None: + """ + Add a custom user agent to the list. + + Args: + user_agent: Custom user agent string to add + """ + if user_agent not in self.USER_AGENTS: + self.USER_AGENTS.append(user_agent) + + def get_all(self) -> List[str]: + """ + Get all available user agent strings. + + Returns: + List of all user agent strings + """ + return self.USER_AGENTS.copy() + + +# Global instance for convenience +_default_generator = UserAgentGenerator() + + +def random_user_agent() -> str: + """ + Get a random user agent string using the default generator. + + Returns: + Random user agent string + + Examples: + >>> ua = random_user_agent() + >>> "Mozilla" in ua + True + """ + return _default_generator.random() + + +def random_user_agent_browser(browser: str) -> str: + """ + Get a random user agent for a specific browser. 
+ + Args: + browser: Browser name ('chrome', 'firefox', 'safari', 'edge') + + Returns: + Random user agent string for the specified browser + """ + return _default_generator.random_browser(browser) + + +def random_user_agent_platform(platform: str) -> str: + """ + Get a random user agent for a specific platform. + + Args: + platform: Platform name ('windows', 'macos', 'linux', 'mobile') + + Returns: + Random user agent string for the specified platform + """ + return _default_generator.random_platform(platform) + + +def set_user_agent_seed(seed: int) -> None: + """ + Set the random seed for user agent generation. + + Args: + seed: Random seed value + """ + global _default_generator + _default_generator = UserAgentGenerator(seed) + + +def add_custom_user_agent(user_agent: str) -> None: + """ + Add a custom user agent to the default generator. + + Args: + user_agent: Custom user agent string to add + """ + _default_generator.add_user_agent(user_agent) + + +def get_all_user_agents() -> List[str]: + """ + Get all available user agent strings from the default generator. + + Returns: + List of all user agent strings + """ + return _default_generator.get_all() diff --git a/fastanime/libs/provider/scraping/utils.py b/fastanime/libs/provider/scraping/utils.py new file mode 100644 index 0000000..4976e9a --- /dev/null +++ b/fastanime/libs/provider/scraping/utils.py @@ -0,0 +1,264 @@ +""" +Encoding and utility functions for web scraping. + +Provides various encoding utilities including base-N encoding +that was previously sourced from yt-dlp. +""" + +import string +from typing import Union,Optional + + +def encode_base_n(num: int, n: int, table: Optional[str] = None) -> str: + """ + Encode a number in base-n representation. + + Args: + num: The number to encode + n: The base to use for encoding + table: Custom character table (optional) + + Returns: + String representation of the number in base-n + + Examples: + >>> encode_base_n(255, 16) + 'ff' + >>> encode_base_n(42, 36) + '16' + """ + if table is None: + # Default table: 0-9, a-z + table = string.digits + string.ascii_lowercase + + if not 2 <= n <= len(table): + raise ValueError(f"Base must be between 2 and {len(table)}") + + if num == 0: + return table[0] + + result = [] + is_negative = num < 0 + num = abs(num) + + while num > 0: + result.append(table[num % n]) + num //= n + + if is_negative: + result.append('-') + + return ''.join(reversed(result)) + + +def decode_base_n(encoded: str, n: int, table: Optional[str] = None) -> int: + """ + Decode a base-n encoded string back to an integer. + + Args: + encoded: The base-n encoded string + n: The base used for encoding + table: Custom character table (optional) + + Returns: + The decoded integer + + Examples: + >>> decode_base_n('ff', 16) + 255 + >>> decode_base_n('16', 36) + 42 + """ + if table is None: + table = string.digits + string.ascii_lowercase + + if not 2 <= n <= len(table): + raise ValueError(f"Base must be between 2 and {len(table)}") + + if not encoded: + return 0 + + is_negative = encoded.startswith('-') + if is_negative: + encoded = encoded[1:] + + result = 0 + for i, char in enumerate(reversed(encoded.lower())): + if char not in table: + raise ValueError(f"Invalid character '{char}' for base {n}") + + digit_value = table.index(char) + if digit_value >= n: + raise ValueError(f"Invalid digit '{char}' for base {n}") + + result += digit_value * (n ** i) + + return -result if is_negative else result + + +def url_encode(text: str, safe: str = '') -> str: + """ + URL encode a string. 
+ + Args: + text: Text to encode + safe: Characters that should not be encoded + + Returns: + URL encoded string + """ + import urllib.parse + return urllib.parse.quote(text, safe=safe) + + +def url_decode(text: str) -> str: + """ + URL decode a string. + + Args: + text: URL encoded text to decode + + Returns: + Decoded string + """ + import urllib.parse + return urllib.parse.unquote(text) + + +def html_unescape(text: str) -> str: + """ + Unescape HTML entities in text. + + Args: + text: Text containing HTML entities + + Returns: + Text with HTML entities unescaped + + Examples: + >>> html_unescape('&quot;Hello&quot; &amp; &lt;World&gt;') + '"Hello" & <World>' + """ + import html + return html.unescape(text) + + +def strip_tags(html_content: str) -> str: + """ + Remove all HTML tags from content, leaving only text. + + Args: + html_content: HTML content with tags + + Returns: + Plain text with tags removed + + Examples: + >>> strip_tags('<p><b>Hello world!</b></p>
') + 'Hello world!' + """ + import re + return re.sub(r'<[^>]+>', '', html_content) + + +def normalize_whitespace(text: str) -> str: + """ + Normalize whitespace in text by collapsing multiple spaces and removing leading/trailing whitespace. + + Args: + text: Text to normalize + + Returns: + Text with normalized whitespace + + Examples: + >>> normalize_whitespace(' Hello world \\n\\t ') + 'Hello world' + """ + import re + return re.sub(r'\s+', ' ', text.strip()) + + +def extract_domain(url: str) -> str: + """ + Extract domain from a URL. + + Args: + url: Full URL + + Returns: + Domain portion of the URL + + Examples: + >>> extract_domain('https://example.com/path?query=1') + 'example.com' + """ + import urllib.parse + parsed = urllib.parse.urlparse(url) + return parsed.netloc + + +def join_url(base: str, path: str) -> str: + """ + Join a base URL with a path. + + Args: + base: Base URL + path: Path to join + + Returns: + Combined URL + + Examples: + >>> join_url('https://example.com', '/api/data') + 'https://example.com/api/data' + """ + import urllib.parse + return urllib.parse.urljoin(base, path) + + +def parse_query_string(query: str) -> dict: + """ + Parse a query string into a dictionary. + + Args: + query: Query string (with or without leading '?') + + Returns: + Dictionary of query parameters + + Examples: + >>> parse_query_string('?name=John&age=30') + {'name': ['John'], 'age': ['30']} + """ + import urllib.parse + if query.startswith('?'): + query = query[1:] + return urllib.parse.parse_qs(query) + + +def build_query_string(params: dict) -> str: + """ + Build a query string from a dictionary of parameters. + + Args: + params: Dictionary of parameters + + Returns: + URL-encoded query string + + Examples: + >>> build_query_string({'name': 'John', 'age': 30}) + 'name=John&age=30' + """ + import urllib.parse + + # Handle both single values and lists + normalized_params = {} + for key, value in params.items(): + if isinstance(value, (list, tuple)): + normalized_params[key] = value + else: + normalized_params[key] = [str(value)] + + return urllib.parse.urlencode(normalized_params, doseq=True)
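
For reviewers, a minimal usage sketch of the new scraping package follows. It is illustrative only and not part of the diff: it assumes the fastanime.libs.provider.scraping layout introduced by this commit, and the sample markup and example.org URLs are made up. It also illustrates why provider.py now reads res_dict["src"] instead of res_dict["data-src"]: extract_attributes matches attribute names with a \w+ pattern, so the "data-" prefix is dropped from the returned keys.

from fastanime.libs.provider.scraping.html_parser import (
    configure_parser,
    extract_attributes,
    get_element_by_id,
    get_elements_html_by_class,
)
from fastanime.libs.provider.scraping.user_agents import random_user_agent
from fastanime.libs.provider.scraping.utils import decode_base_n, encode_base_n

# Illustrative markup loosely mirroring the animepahe resolution menu; real pages differ.
html = (
    '<div id="resolutionMenu">'
    '<a class="dropdown-item" data-src="https://example.org/720" data-audio="eng" data-resolution="720">720p</a>'
    '<a class="dropdown-item" data-src="https://example.org/1080" data-audio="jpn" data-resolution="1080">1080p</a>'
    '</div>'
)

configure_parser(use_lxml=False)  # force the stdlib backend; pass True to prefer lxml when it is installed

menu = get_element_by_id("resolutionMenu", html)
assert menu is not None
for item in get_elements_html_by_class("dropdown-item", menu):
    attrs = extract_attributes(item)
    # The \w+ attribute regex drops the "data-" prefix, hence "src"/"audio"/"resolution".
    print(attrs["src"], attrs["audio"], attrs["resolution"])

# Random user agent for HTTP client headers, as the debug harness does.
headers = {"User-Agent": random_user_agent()}
print(headers)

# Base-n round trip of the kind used by the animepahe key creator.
assert decode_base_n(encode_base_n(255, 16), 16) == 255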