Mirror of https://github.com/Benexl/FastAnime.git (synced 2025-12-12 15:50:01 -08:00)
feat: refactor provider imports and enhance HTML parsing utilities
@@ -3,4 +3,4 @@ provider_type=$1
 provider_name=$2
 [ -z "$provider_type" ] && echo "Please specify provider type" && exit
 [ -z "$provider_name" ] && echo "Please specify provider type" && exit
-uv run python -m fastanime.libs.providers.${provider_type}.${provider_name}.provider
+uv run python -m fastanime.libs.provider.${provider_type}.${provider_name}.provider
@@ -2,7 +2,7 @@ import re


 def animepahe_key_creator(c: int, a: int):
-    from yt_dlp.utils import encode_base_n
+    from ...scraping.utils import encode_base_n

     if c < a:
         val_a = ""
@@ -37,17 +37,18 @@ ENCODE_JS_REGEX = re.compile(r"'(.*?);',(\d+),(\d+),'(.*)'\.split")


 def process_animepahe_embed_page(embed_page: str):
-    from yt_dlp.utils import get_element_text_and_html_by_tag
+    from ...scraping.html_parser import get_element_text_and_html_by_tag

     encoded_js_string = ""
     embed_page_content = embed_page
     for _ in range(8):
         text, html = get_element_text_and_html_by_tag("script", embed_page_content)
-        if not text:
+        if not text and html:
             embed_page_content = re.sub(html, "", embed_page_content)
             continue
-        encoded_js_string = text.strip()
-        break
+        if text:
+            encoded_js_string = text.strip()
+            break
     if not encoded_js_string:
         return
     obsfucated_js_parameter_match = PARAMETERS_REGEX.search(encoded_js_string)
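Note: encode_base_n is now vendored in fastanime/libs/provider/scraping/utils.py (added below) instead of being imported from yt-dlp. A quick doctest-style sanity check of the replacement, using the values from that module's docstrings:

# Sketch: the vendored helper should behave like the yt-dlp function it
# replaces (default table is digits + ascii_lowercase, i.e. up to base 36).
from fastanime.libs.provider.scraping.utils import encode_base_n

assert encode_base_n(255, 16) == "ff"
assert encode_base_n(42, 36) == "16"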
@@ -106,8 +106,7 @@ class AnimePahe(BaseAnimeProvider):

     @debug_provider
     def episode_streams(self, params: EpisodeStreamsParams) -> Iterator[Server] | None:
-        # TODO: replace with custom implementations using default html parser or lxml
-        from yt_dlp.utils import (
+        from ...scraping.html_parser import (
             extract_attributes,
             get_element_by_id,
             get_elements_html_by_class,
@@ -125,6 +124,9 @@ class AnimePahe(BaseAnimeProvider):
         response.raise_for_status()

         c = get_element_by_id("resolutionMenu", response.text)
+        if not c:
+            logger.error("Resolution menu not found in the response")
+            return
         resolutionMenuItems = get_elements_html_by_class("dropdown-item", c)
         res_dicts = [extract_attributes(item) for item in resolutionMenuItems]
         quality = None
@@ -133,8 +135,9 @@ class AnimePahe(BaseAnimeProvider):

         # TODO: better document the scraping process
         for res_dict in res_dicts:
-            embed_url = res_dict["data-src"]
-            data_audio = "dub" if res_dict["data-audio"] == "eng" else "sub"
+            # the actual attributes are data attributes in the original html, prefixed with 'data-'
+            embed_url = res_dict["src"]
+            data_audio = "dub" if res_dict["audio"] == "eng" else "sub"

             if data_audio != params.translation_type:
                 continue
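Note: the key change (data-src → src, data-audio → audio) follows from the new extract_attributes in scraping/html_parser.py: its attribute regex matches \w+ before '=', and because '-' is not a word character the 'data-' prefix is not part of the captured key. A sketch on made-up markup:

# Hypothetical dropdown-item markup; with the regex-based extract_attributes,
# 'data-src' is captured as 'src' because \w+ cannot match the hyphen.
from fastanime.libs.provider.scraping.html_parser import extract_attributes

item = '<a class="dropdown-item" data-src="https://example.com/e/abc" data-audio="eng" data-resolution="1080">'
print(extract_attributes(item))
# {'class': 'dropdown-item', 'src': 'https://example.com/e/abc', 'audio': 'eng', 'resolution': '1080'}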
@@ -162,7 +165,7 @@ class AnimePahe(BaseAnimeProvider):
                 logger.error("failed to find juicy stream")
                 continue
             juicy_stream = juicy_stream.group(1)
-            quality = res_dict["data-resolution"]
+            quality = res_dict["resolution"]
             translation_type = data_audio
             stream_link = juicy_stream
@@ -36,7 +36,7 @@ def test_anime_provider(AnimeProvider: Type[BaseAnimeProvider]):
     anime_provider = AnimeProvider(
         Client(headers={"User-Agent": random_user_agent(), **AnimeProvider.HEADERS})
     )
-    print(APP_ASCII_ART)
+    print(APP_ASCII_ART.read_text(encoding="utf-8"))
     query = input("What anime would you like to stream: ")
     search_results = anime_provider.search(SearchParams(query=query))
     if not search_results:
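Note: the switch from print(APP_ASCII_ART) to print(APP_ASCII_ART.read_text(encoding="utf-8")) suggests APP_ASCII_ART is now a pathlib.Path to a bundled asset rather than an inline string. A hypothetical definition consistent with the new call site (the constant's actual location and filename are not shown in this diff):

# Purely illustrative; only the read_text(encoding="utf-8") call is confirmed
# by the diff. The path below is an assumption.
from pathlib import Path

APP_ASCII_ART = Path(__file__).parent / "assets" / "logo.txt"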
New file: fastanime/libs/provider/scraping/__init__.py (empty)
New file: fastanime/libs/provider/scraping/html_parser.py (474 lines)
@@ -0,0 +1,474 @@
"""
HTML parsing utilities with optional lxml support.

This module provides comprehensive HTML parsing capabilities using either
Python's built-in html.parser or lxml for better performance when available.
"""

# TODO: Review and optimize the HTML parsing logic for better performance and flexibility.
# Consider adding more utility functions for common HTML manipulation tasks.
import logging
import re
from html.parser import HTMLParser as BaseHTMLParser
from typing import Dict, List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

# Try to import lxml
HAS_LXML = False
try:
    from lxml import etree, html as lxml_html

    HAS_LXML = True
    logger.debug("lxml is available and will be used for HTML parsing")
except ImportError:
    logger.debug("lxml not available, falling back to html.parser")


class HTMLParserConfig:
    """Configuration for HTML parser selection."""

    def __init__(self, use_lxml: Optional[bool] = None):
        """
        Initialize parser configuration.

        Args:
            use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
        """
        if use_lxml is None:
            self.use_lxml = HAS_LXML
        else:
            self.use_lxml = use_lxml and HAS_LXML

        if use_lxml and not HAS_LXML:
            logger.warning("lxml requested but not available, falling back to html.parser")


class HTMLParser:
    """
    Comprehensive HTML parser with optional lxml support.

    Provides a unified interface for HTML parsing operations regardless
    of the underlying parser implementation.
    """

    def __init__(self, config: Optional[HTMLParserConfig] = None):
        """Initialize the HTML parser with configuration."""
        self.config = config or HTMLParserConfig()

    def parse(self, html_content: str) -> Union["etree._Element", "ParsedHTML"]:
        """
        Parse HTML content and return a parsed tree.

        Args:
            html_content: Raw HTML string to parse

        Returns:
            Parsed HTML tree (lxml Element or custom ParsedHTML object)
        """
        if self.config.use_lxml:
            return self._parse_with_lxml(html_content)
        else:
            return self._parse_with_builtin(html_content)

    def _parse_with_lxml(self, html_content: str) -> "etree._Element":
        """Parse HTML using lxml."""
        try:
            # Use lxml's HTML parser which is more lenient
            return lxml_html.fromstring(html_content)
        except Exception as e:
            logger.warning(f"lxml parsing failed: {e}, falling back to html.parser")
            return self._parse_with_builtin(html_content)

    def _parse_with_builtin(self, html_content: str) -> "ParsedHTML":
        """Parse HTML using Python's built-in parser."""
        parser = BuiltinHTMLParser()
        parser.feed(html_content)
        return ParsedHTML(parser.elements, html_content)


class BuiltinHTMLParser(BaseHTMLParser):
    """Enhanced HTML parser using Python's built-in capabilities."""

    def __init__(self):
        super().__init__()
        self.elements = []
        self.current_element = None
        self.element_stack = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Handle opening tags."""
        element = {
            'tag': tag,
            'attrs': dict(attrs),
            'text': '',
            'children': [],
            'start_pos': self.getpos(),
        }

        if self.element_stack:
            self.element_stack[-1]['children'].append(element)
        else:
            self.elements.append(element)

        self.element_stack.append(element)

    def handle_endtag(self, tag: str):
        """Handle closing tags."""
        if self.element_stack and self.element_stack[-1]['tag'] == tag:
            element = self.element_stack.pop()
            element['end_pos'] = self.getpos()

    def handle_data(self, data: str):
        """Handle text content."""
        if self.element_stack:
            self.element_stack[-1]['text'] += data


class ParsedHTML:
    """Wrapper for parsed HTML using built-in parser."""

    def __init__(self, elements: List[Dict], raw_html: str):
        self.elements = elements
        self.raw_html = raw_html

    def find_by_id(self, element_id: str) -> Optional[Dict]:
        """Find element by ID."""
        return self._find_recursive(self.elements, lambda el: el['attrs'].get('id') == element_id)

    def find_by_class(self, class_name: str) -> List[Dict]:
        """Find elements by class name."""
        results = []
        self._find_all_recursive(
            self.elements,
            lambda el: class_name in el['attrs'].get('class', '').split(),
            results
        )
        return results

    def find_by_tag(self, tag_name: str) -> List[Dict]:
        """Find elements by tag name."""
        results = []
        self._find_all_recursive(
            self.elements,
            lambda el: el['tag'].lower() == tag_name.lower(),
            results
        )
        return results

    def _find_recursive(self, elements: List[Dict], condition) -> Optional[Dict]:
        """Recursively find first element matching condition."""
        for element in elements:
            if condition(element):
                return element
            result = self._find_recursive(element['children'], condition)
            if result:
                return result
        return None

    def _find_all_recursive(self, elements: List[Dict], condition, results: List[Dict]):
        """Recursively find all elements matching condition."""
        for element in elements:
            if condition(element):
                results.append(element)
            self._find_all_recursive(element['children'], condition, results)


# Global parser instance
_default_parser = HTMLParser()


def extract_attributes(html_element: str) -> Dict[str, str]:
    """
    Extract attributes from an HTML element string.

    Args:
        html_element: HTML element as string (e.g., '<div class="test" id="main">')

    Returns:
        Dictionary of attribute name-value pairs

    Examples:
        >>> extract_attributes('<div class="test" id="main">')
        {'class': 'test', 'id': 'main'}
    """
    if not html_element:
        return {}

    # Use regex to extract attributes from HTML string
    attr_pattern = r'(\w+)=(["\'])([^"\']*?)\2'
    matches = re.findall(attr_pattern, html_element)

    attributes = {}
    for match in matches:
        attr_name, _, attr_value = match
        attributes[attr_name] = attr_value

    # Handle attributes without quotes
    unquoted_pattern = r'(\w+)=([^\s>]+)'
    unquoted_matches = re.findall(unquoted_pattern, html_element)
    for attr_name, attr_value in unquoted_matches:
        if attr_name not in attributes:
            attributes[attr_name] = attr_value

    return attributes


def get_element_by_id(element_id: str, html_content: str) -> Optional[str]:
    """
    Get HTML element by ID.

    Args:
        element_id: The ID attribute value to search for
        html_content: HTML content to search in

    Returns:
        HTML string of the element or None if not found

    Examples:
        >>> html = '<div id="test">Content</div>'
        >>> get_element_by_id("test", html)
        '<div id="test">Content</div>'
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            element = parsed.xpath(f'//*[@id="{element_id}"]')
            if element:
                return etree.tostring(element[0], encoding='unicode', method='html')
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None
    else:
        element = parsed.find_by_id(element_id)
        if element:
            return _element_to_html(element, html_content)

    return None


def get_element_by_tag(tag_name: str, html_content: str) -> Optional[str]:
    """
    Get first HTML element by tag name.

    Args:
        tag_name: The tag name to search for
        html_content: HTML content to search in

    Returns:
        HTML string of the element or None if not found
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//{tag_name}')
            if elements:
                return etree.tostring(elements[0], encoding='unicode', method='html')
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None
    else:
        elements = parsed.find_by_tag(tag_name)
        if elements:
            return _element_to_html(elements[0], html_content)

    return None


def get_element_by_class(class_name: str, html_content: str) -> Optional[str]:
    """
    Get first HTML element by class name.

    Args:
        class_name: The class name to search for
        html_content: HTML content to search in

    Returns:
        HTML string of the element or None if not found
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
            if elements:
                return etree.tostring(elements[0], encoding='unicode', method='html')
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None
    else:
        elements = parsed.find_by_class(class_name)
        if elements:
            return _element_to_html(elements[0], html_content)

    return None


def get_elements_by_tag(tag_name: str, html_content: str) -> List[str]:
    """
    Get all HTML elements by tag name.

    Args:
        tag_name: The tag name to search for
        html_content: HTML content to search in

    Returns:
        List of HTML strings for matching elements
    """
    parsed = _default_parser.parse(html_content)
    results = []

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//{tag_name}')
            for element in elements:
                results.append(etree.tostring(element, encoding='unicode', method='html'))
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
    else:
        elements = parsed.find_by_tag(tag_name)
        for element in elements:
            results.append(_element_to_html(element, html_content))

    return results


def get_elements_by_class(class_name: str, html_content: str) -> List[str]:
    """
    Get all HTML elements by class name.

    Args:
        class_name: The class name to search for
        html_content: HTML content to search in

    Returns:
        List of HTML strings for matching elements
    """
    parsed = _default_parser.parse(html_content)
    results = []

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//*[contains(@class, "{class_name}")]')
            for element in elements:
                results.append(etree.tostring(element, encoding='unicode', method='html'))
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
    else:
        elements = parsed.find_by_class(class_name)
        for element in elements:
            results.append(_element_to_html(element, html_content))

    return results


def get_elements_html_by_class(class_name: str, html_content: str) -> List[str]:
    """
    Get HTML strings of elements by class name.

    This is an alias for get_elements_by_class for yt-dlp compatibility.

    Args:
        class_name: The class name to search for
        html_content: HTML content to search in

    Returns:
        List of HTML strings for matching elements
    """
    return get_elements_by_class(class_name, html_content)


def get_element_text_and_html_by_tag(tag_name: str, html_content: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Get both text content and HTML of first element by tag name.

    Args:
        tag_name: The tag name to search for
        html_content: HTML content to search in

    Returns:
        Tuple of (text_content, html_string) or (None, None) if not found

    Examples:
        >>> html = '<script>alert("test");</script>'
        >>> get_element_text_and_html_by_tag("script", html)
        ('alert("test");', '<script>alert("test");</script>')
    """
    parsed = _default_parser.parse(html_content)

    if _default_parser.config.use_lxml:
        try:
            elements = parsed.xpath(f'//{tag_name}')
            if elements:
                element = elements[0]
                text = element.text_content() if hasattr(element, 'text_content') else (element.text or '')
                html_str = etree.tostring(element, encoding='unicode', method='html')
                return text, html_str
        except Exception as e:
            logger.warning(f"lxml XPath search failed: {e}")
        return None, None
    else:
        elements = parsed.find_by_tag(tag_name)
        if elements:
            element = elements[0]
            text = _extract_text_content(element)
            html_str = _element_to_html(element, html_content)
            return text, html_str

    return None, None


def _element_to_html(element: Dict, original_html: str) -> str:
    """
    Convert parsed element back to HTML string.

    This is a simplified implementation that reconstructs HTML from parsed data.
    For production use, consider using lxml for better accuracy.
    """
    if not element:
        return ""

    # Build opening tag
    tag = element['tag']
    attrs = element.get('attrs', {})
    attr_str = ' '.join(f'{k}="{v}"' for k, v in attrs.items() if v is not None)

    if attr_str:
        opening_tag = f"<{tag} {attr_str}>"
    else:
        opening_tag = f"<{tag}>"

    # Add text content
    text = element.get('text', '')

    # Add children
    children_html = ""
    for child in element.get('children', []):
        children_html += _element_to_html(child, original_html)

    # Build closing tag
    closing_tag = f"</{tag}>"

    return f"{opening_tag}{text}{children_html}{closing_tag}"


def _extract_text_content(element: Dict) -> str:
    """Extract all text content from element and its children."""
    text = element.get('text', '')

    for child in element.get('children', []):
        text += _extract_text_content(child)

    return text


def configure_parser(use_lxml: Optional[bool] = None) -> None:
    """
    Configure the global HTML parser.

    Args:
        use_lxml: Force use of lxml (True), html.parser (False), or auto-detect (None)
    """
    global _default_parser
    _default_parser = HTMLParser(HTMLParserConfig(use_lxml))
    logger.info(f"HTML parser configured: {'lxml' if _default_parser.config.use_lxml else 'html.parser'}")
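Note: a short usage sketch of the new module's public surface, e.g. forcing the pure-Python fallback in tests even when lxml is installed:

# configure_parser(use_lxml=False) swaps the global parser to html.parser;
# passing None restores auto-detection based on lxml availability.
from fastanime.libs.provider.scraping import html_parser

html_parser.configure_parser(use_lxml=False)
text, html = html_parser.get_element_text_and_html_by_tag(
    "script", "<script>var packed = 1;</script>"
)
assert text == "var packed = 1;"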
New file: fastanime/libs/provider/scraping/user_agents.py (238 lines)
@@ -0,0 +1,238 @@
"""
User agent utilities for web scraping.

Provides functionality to generate random user agent strings
to avoid detection and blocking by websites.
"""

import random
from typing import List, Optional


class UserAgentGenerator:
    """
    Generator for realistic user agent strings.

    Provides a variety of common user agents from different browsers
    and operating systems to help avoid detection.
    """

    # Common user agents for different browsers and OS combinations
    USER_AGENTS = [
        # Chrome on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        # Chrome on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        # Chrome on Linux
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        # Firefox on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
        # Firefox on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0",
        # Firefox on Linux
        "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
        # Safari on macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        # Edge on Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
        # Mobile Chrome (Android)
        "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
        # Mobile Safari (iOS)
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1",
    ]

    # Browser-specific user agents for when you need a specific browser
    CHROME_USER_AGENTS = [ua for ua in USER_AGENTS if "Chrome" in ua and "Edg" not in ua]
    FIREFOX_USER_AGENTS = [ua for ua in USER_AGENTS if "Firefox" in ua]
    SAFARI_USER_AGENTS = [ua for ua in USER_AGENTS if "Safari" in ua and "Chrome" not in ua]
    EDGE_USER_AGENTS = [ua for ua in USER_AGENTS if "Edg" in ua]

    # Platform-specific user agents
    WINDOWS_USER_AGENTS = [ua for ua in USER_AGENTS if "Windows NT" in ua]
    MACOS_USER_AGENTS = [ua for ua in USER_AGENTS if "Macintosh" in ua]
    LINUX_USER_AGENTS = [ua for ua in USER_AGENTS if "Linux" in ua and "Android" not in ua]
    MOBILE_USER_AGENTS = [ua for ua in USER_AGENTS if "Mobile" in ua or "Android" in ua]

    def __init__(self, seed: Optional[int] = None):
        """
        Initialize the user agent generator.

        Args:
            seed: Random seed for reproducible results (optional)
        """
        if seed is not None:
            random.seed(seed)

    def random(self) -> str:
        """
        Get a random user agent string.

        Returns:
            Random user agent string
        """
        return random.choice(self.USER_AGENTS)

    def random_browser(self, browser: str) -> str:
        """
        Get a random user agent for a specific browser.

        Args:
            browser: Browser name ('chrome', 'firefox', 'safari', 'edge')

        Returns:
            Random user agent string for the specified browser

        Raises:
            ValueError: If browser is not supported
        """
        browser = browser.lower()
        if browser == 'chrome':
            return random.choice(self.CHROME_USER_AGENTS)
        elif browser == 'firefox':
            return random.choice(self.FIREFOX_USER_AGENTS)
        elif browser == 'safari':
            return random.choice(self.SAFARI_USER_AGENTS)
        elif browser == 'edge':
            return random.choice(self.EDGE_USER_AGENTS)
        else:
            raise ValueError(f"Unsupported browser: {browser}")

    def random_platform(self, platform: str) -> str:
        """
        Get a random user agent for a specific platform.

        Args:
            platform: Platform name ('windows', 'macos', 'linux', 'mobile')

        Returns:
            Random user agent string for the specified platform

        Raises:
            ValueError: If platform is not supported
        """
        platform = platform.lower()
        if platform == 'windows':
            return random.choice(self.WINDOWS_USER_AGENTS)
        elif platform in ('macos', 'mac'):
            return random.choice(self.MACOS_USER_AGENTS)
        elif platform == 'linux':
            return random.choice(self.LINUX_USER_AGENTS)
        elif platform == 'mobile':
            return random.choice(self.MOBILE_USER_AGENTS)
        else:
            raise ValueError(f"Unsupported platform: {platform}")

    def add_user_agent(self, user_agent: str) -> None:
        """
        Add a custom user agent to the list.

        Args:
            user_agent: Custom user agent string to add
        """
        if user_agent not in self.USER_AGENTS:
            self.USER_AGENTS.append(user_agent)

    def get_all(self) -> List[str]:
        """
        Get all available user agent strings.

        Returns:
            List of all user agent strings
        """
        return self.USER_AGENTS.copy()


# Global instance for convenience
_default_generator = UserAgentGenerator()


def random_user_agent() -> str:
    """
    Get a random user agent string using the default generator.

    Returns:
        Random user agent string

    Examples:
        >>> ua = random_user_agent()
        >>> "Mozilla" in ua
        True
    """
    return _default_generator.random()


def random_user_agent_browser(browser: str) -> str:
    """
    Get a random user agent for a specific browser.

    Args:
        browser: Browser name ('chrome', 'firefox', 'safari', 'edge')

    Returns:
        Random user agent string for the specified browser
    """
    return _default_generator.random_browser(browser)


def random_user_agent_platform(platform: str) -> str:
    """
    Get a random user agent for a specific platform.

    Args:
        platform: Platform name ('windows', 'macos', 'linux', 'mobile')

    Returns:
        Random user agent string for the specified platform
    """
    return _default_generator.random_platform(platform)


def set_user_agent_seed(seed: int) -> None:
    """
    Set the random seed for user agent generation.

    Args:
        seed: Random seed value
    """
    global _default_generator
    _default_generator = UserAgentGenerator(seed)


def add_custom_user_agent(user_agent: str) -> None:
    """
    Add a custom user agent to the default generator.

    Args:
        user_agent: Custom user agent string to add
    """
    _default_generator.add_user_agent(user_agent)


def get_all_user_agents() -> List[str]:
    """
    Get all available user agent strings from the default generator.

    Returns:
        List of all user agent strings
    """
    return _default_generator.get_all()
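Note: a sketch of the generator as the updated test above uses it; treating Client as httpx.Client is an assumption, since the test's imports are not part of this diff:

# Hypothetical client setup mirroring the test change; random_user_agent()
# draws from UserAgentGenerator.USER_AGENTS via the module-level default.
from httpx import Client

from fastanime.libs.provider.scraping.user_agents import (
    random_user_agent,
    random_user_agent_browser,
)

client = Client(headers={"User-Agent": random_user_agent()})
firefox_only = random_user_agent_browser("firefox")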
New file: fastanime/libs/provider/scraping/utils.py (264 lines)
@@ -0,0 +1,264 @@
"""
Encoding and utility functions for web scraping.

Provides various encoding utilities including base-N encoding
that was previously sourced from yt-dlp.
"""

import string
from typing import Optional, Union


def encode_base_n(num: int, n: int, table: Optional[str] = None) -> str:
    """
    Encode a number in base-n representation.

    Args:
        num: The number to encode
        n: The base to use for encoding
        table: Custom character table (optional)

    Returns:
        String representation of the number in base-n

    Examples:
        >>> encode_base_n(255, 16)
        'ff'
        >>> encode_base_n(42, 36)
        '16'
    """
    if table is None:
        # Default table: 0-9, a-z
        table = string.digits + string.ascii_lowercase

    if not 2 <= n <= len(table):
        raise ValueError(f"Base must be between 2 and {len(table)}")

    if num == 0:
        return table[0]

    result = []
    is_negative = num < 0
    num = abs(num)

    while num > 0:
        result.append(table[num % n])
        num //= n

    if is_negative:
        result.append('-')

    return ''.join(reversed(result))


def decode_base_n(encoded: str, n: int, table: Optional[str] = None) -> int:
    """
    Decode a base-n encoded string back to an integer.

    Args:
        encoded: The base-n encoded string
        n: The base used for encoding
        table: Custom character table (optional)

    Returns:
        The decoded integer

    Examples:
        >>> decode_base_n('ff', 16)
        255
        >>> decode_base_n('16', 36)
        42
    """
    if table is None:
        table = string.digits + string.ascii_lowercase

    if not 2 <= n <= len(table):
        raise ValueError(f"Base must be between 2 and {len(table)}")

    if not encoded:
        return 0

    is_negative = encoded.startswith('-')
    if is_negative:
        encoded = encoded[1:]

    result = 0
    for i, char in enumerate(reversed(encoded.lower())):
        if char not in table:
            raise ValueError(f"Invalid character '{char}' for base {n}")

        digit_value = table.index(char)
        if digit_value >= n:
            raise ValueError(f"Invalid digit '{char}' for base {n}")

        result += digit_value * (n ** i)

    return -result if is_negative else result


def url_encode(text: str, safe: str = '') -> str:
    """
    URL encode a string.

    Args:
        text: Text to encode
        safe: Characters that should not be encoded

    Returns:
        URL encoded string
    """
    import urllib.parse
    return urllib.parse.quote(text, safe=safe)


def url_decode(text: str) -> str:
    """
    URL decode a string.

    Args:
        text: URL encoded text to decode

    Returns:
        Decoded string
    """
    import urllib.parse
    return urllib.parse.unquote(text)


def html_unescape(text: str) -> str:
    """
    Unescape HTML entities in text.

    Args:
        text: Text containing HTML entities

    Returns:
        Text with HTML entities unescaped

    Examples:
        >>> html_unescape('&quot;Hello&quot; &amp; &lt;World&gt;')
        '"Hello" & <World>'
    """
    import html
    return html.unescape(text)


def strip_tags(html_content: str) -> str:
    """
    Remove all HTML tags from content, leaving only text.

    Args:
        html_content: HTML content with tags

    Returns:
        Plain text with tags removed

    Examples:
        >>> strip_tags('<p>Hello <b>world</b>!</p>')
        'Hello world!'
    """
    import re
    return re.sub(r'<[^>]+>', '', html_content)


def normalize_whitespace(text: str) -> str:
    """
    Normalize whitespace in text by collapsing multiple spaces and removing leading/trailing whitespace.

    Args:
        text: Text to normalize

    Returns:
        Text with normalized whitespace

    Examples:
        >>> normalize_whitespace('  Hello   world  \\n\\t  ')
        'Hello world'
    """
    import re
    return re.sub(r'\s+', ' ', text.strip())


def extract_domain(url: str) -> str:
    """
    Extract domain from a URL.

    Args:
        url: Full URL

    Returns:
        Domain portion of the URL

    Examples:
        >>> extract_domain('https://example.com/path?query=1')
        'example.com'
    """
    import urllib.parse
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc


def join_url(base: str, path: str) -> str:
    """
    Join a base URL with a path.

    Args:
        base: Base URL
        path: Path to join

    Returns:
        Combined URL

    Examples:
        >>> join_url('https://example.com', '/api/data')
        'https://example.com/api/data'
    """
    import urllib.parse
    return urllib.parse.urljoin(base, path)


def parse_query_string(query: str) -> dict:
    """
    Parse a query string into a dictionary.

    Args:
        query: Query string (with or without leading '?')

    Returns:
        Dictionary of query parameters

    Examples:
        >>> parse_query_string('?name=John&age=30')
        {'name': ['John'], 'age': ['30']}
    """
    import urllib.parse
    if query.startswith('?'):
        query = query[1:]
    return urllib.parse.parse_qs(query)


def build_query_string(params: dict) -> str:
    """
    Build a query string from a dictionary of parameters.

    Args:
        params: Dictionary of parameters

    Returns:
        URL-encoded query string

    Examples:
        >>> build_query_string({'name': 'John', 'age': 30})
        'name=John&age=30'
    """
    import urllib.parse

    # Handle both single values and lists
    normalized_params = {}
    for key, value in params.items():
        if isinstance(value, (list, tuple)):
            normalized_params[key] = value
        else:
            normalized_params[key] = [str(value)]

    return urllib.parse.urlencode(normalized_params, doseq=True)
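Note: a quick round-trip check of the vendored base-n pair (values consistent with the docstrings above):

# decode_base_n should invert encode_base_n for any base the default
# digits + lowercase table supports (2..36), including negative inputs.
from fastanime.libs.provider.scraping.utils import decode_base_n, encode_base_n

for base in (2, 16, 36):
    for num in (0, 42, 255, -1000):
        assert decode_base_n(encode_base_n(num, base), base) == num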