Files
FastAnime/viu_cli/core/utils/fuzzy.py
2025-08-18 01:07:36 +03:00

481 lines
14 KiB
Python

"""
Fuzzy string matching utilities with fallback implementation.
This module provides a fuzzy matching class that uses thefuzz if available,
otherwise falls back to a pure Python implementation with the same API.
Usage:
Basic usage with the convenience functions:
>>> from viu_media.core.utils.fuzzy import fuzz
>>> fuzz.ratio("hello world", "hello")
62
>>> fuzz.partial_ratio("hello world", "hello")
100
Using the FuzzyMatcher class directly:
>>> from viu_media.core.utils.fuzzy import FuzzyMatcher
>>> matcher = FuzzyMatcher()
>>> matcher.backend
'thefuzz' # or 'pure_python' if thefuzz is not available
>>> matcher.token_sort_ratio("fuzzy wuzzy", "wuzzy fuzzy")
100
For drop-in replacement of thefuzz.fuzz:
>>> from viu_media.core.utils.fuzzy import ratio, partial_ratio
>>> ratio("test", "best")
75
"""
import logging

logger = logging.getLogger(__name__)

# Optional dependency: bind the thefuzz scorer module when it can be imported,
# otherwise record its absence so FuzzyMatcher can select the fallback class.
try:
    from thefuzz import fuzz as _fuzz_impl
except ImportError:
    _fuzz_impl = None
    THEFUZZ_AVAILABLE = False
    logger.debug("thefuzz not available, using fallback implementation")
else:
    THEFUZZ_AVAILABLE = True
    logger.debug("Using thefuzz for fuzzy matching")
class _PurePythonFuzz:
    """
    Pure Python implementation of fuzzy string matching algorithms.

    Exposes the scorer names used by FuzzyMatcher (``ratio``,
    ``partial_ratio``, ``token_sort_ratio``, ``token_set_ratio``,
    ``partial_token_sort_ratio``, ``partial_token_set_ratio``) as static
    methods, so the class itself can be used as a backend without being
    instantiated. Every scorer returns an integer from 0 to 100.

    NOTE(review): ``ratio`` is based on Levenshtein distance normalized by
    the longer string, so ``ratio("hello world", "hello")`` is 45 here while
    the module docstring example shows 62 -- confirm whether score parity
    with thefuzz is required for the fallback.
    """

    @staticmethod
    def _levenshtein_distance(s1: str, s2: str) -> int:
        """
        Calculate the Levenshtein distance between two strings.

        Args:
            s1: First string
            s2: Second string

        Returns:
            The minimum number of single-character insertions, deletions
            and substitutions needed to turn one string into the other
        """
        # Keep the shorter string on the inner axis so each row is as
        # small as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1
        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertion = previous_row[j + 1] + 1
                deletion = current_row[j] + 1
                # bool is an int: adds 1 only when the characters differ.
                substitution = previous_row[j] + (c1 != c2)
                current_row.append(min(insertion, deletion, substitution))
            previous_row = current_row
        return previous_row[-1]

    @staticmethod
    def _longest_common_subsequence(s1: str, s2: str) -> int:
        """
        Calculate the length of the longest common subsequence.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Length of the longest common subsequence
        """
        # Only the previous row of the DP table is ever read, so two rolling
        # rows replace the full (len(s1) + 1) x (len(s2) + 1) table.
        previous_row = [0] * (len(s2) + 1)
        for c1 in s1:
            current_row = [0]
            for j, c2 in enumerate(s2):
                if c1 == c2:
                    current_row.append(previous_row[j] + 1)
                else:
                    current_row.append(max(previous_row[j + 1], current_row[j]))
            previous_row = current_row
        return previous_row[-1]

    @staticmethod
    def _normalize_string(s: str) -> str:
        """
        Normalize a string for comparison.

        Args:
            s: String to normalize

        Returns:
            The string lowercased with leading/trailing whitespace removed
        """
        return s.lower().strip()

    @staticmethod
    def _token_sort_score(s1: str, s2: str, scorer) -> int:
        """
        Score two strings after normalizing and alphabetically sorting tokens.

        Args:
            s1: First string
            s2: Second string
            scorer: Two-argument callable (``ratio`` or ``partial_ratio``)
                applied to the re-joined token strings

        Returns:
            0 if either input is empty, otherwise the scorer's result
        """
        if not s1 or not s2:
            return 0
        normalize = _PurePythonFuzz._normalize_string
        sorted_s1 = " ".join(sorted(normalize(s1).split()))
        sorted_s2 = " ".join(sorted(normalize(s2).split()))
        return scorer(sorted_s1, sorted_s2)

    @staticmethod
    def _token_set_score(s1: str, s2: str, scorer) -> int:
        """
        Score two strings using set operations on their normalized tokens.

        Each side is rebuilt as "<sorted shared tokens> <sorted tokens unique
        to that side>" and the two rebuilt strings are passed to ``scorer``.

        Args:
            s1: First string
            s2: Second string
            scorer: Two-argument callable (``ratio`` or ``partial_ratio``)

        Returns:
            0 if either input is empty or exactly one side has no tokens,
            100 if neither side has any tokens, otherwise the scorer's result
        """
        if not s1 or not s2:
            return 0
        normalize = _PurePythonFuzz._normalize_string
        tokens1 = set(normalize(s1).split())
        tokens2 = set(normalize(s2).split())

        shared = " ".join(sorted(tokens1 & tokens2))
        only1 = " ".join(sorted(tokens1 - tokens2))
        only2 = " ".join(sorted(tokens2 - tokens1))

        # strip() removes the dangling separator when either part is empty.
        combined1 = f"{shared} {only1}".strip()
        combined2 = f"{shared} {only2}".strip()

        # Whitespace-only inputs produce no tokens at all; handle them here
        # so the outcome does not depend on how the scorer treats "".
        if not combined1 and not combined2:
            return 100
        if not combined1 or not combined2:
            return 0
        return scorer(combined1, combined2)

    @staticmethod
    def ratio(s1: str, s2: str) -> int:
        """
        Calculate the similarity ratio between two strings using Levenshtein distance.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Similarity ratio as an integer from 0 to 100: 100 when both
            strings are empty, 0 when exactly one is empty, otherwise
            ``floor(100 * (max_len - distance) / max_len)``
        """
        if not s1 and not s2:
            return 100
        if not s1 or not s2:
            return 0

        distance = _PurePythonFuzz._levenshtein_distance(s1, s2)
        max_len = max(len(s1), len(s2))
        # Integer floor division: int(float * 100) under-reports exact
        # percentages such as 29/50 (0.58 * 100 == 57.99999999999999).
        return (max_len - distance) * 100 // max_len

    @staticmethod
    def partial_ratio(s1: str, s2: str) -> int:
        """
        Calculate the partial similarity ratio between two strings.

        Slides the shorter string across every same-length window of the
        longer one and keeps the best ``ratio``.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Partial similarity ratio as an integer from 0 to 100
            (0 if either string is empty)
        """
        if not s1 or not s2:
            return 0

        if len(s1) <= len(s2):
            shorter, longer = s1, s2
        else:
            shorter, longer = s2, s1

        window = len(shorter)
        best = 0
        for start in range(len(longer) - window + 1):
            score = _PurePythonFuzz.ratio(shorter, longer[start : start + window])
            if score > best:
                best = score
                if best == 100:
                    # Nothing can beat a perfect window; stop scanning.
                    break
        return best

    @staticmethod
    def token_sort_ratio(s1: str, s2: str) -> int:
        """
        Calculate similarity after sorting tokens in both strings.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Token sort ratio as an integer from 0 to 100
        """
        return _PurePythonFuzz._token_sort_score(s1, s2, _PurePythonFuzz.ratio)

    @staticmethod
    def token_set_ratio(s1: str, s2: str) -> int:
        """
        Calculate similarity using set operations on tokens.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Token set ratio as an integer from 0 to 100
        """
        return _PurePythonFuzz._token_set_score(s1, s2, _PurePythonFuzz.ratio)

    @staticmethod
    def partial_token_sort_ratio(s1: str, s2: str) -> int:
        """
        Calculate partial similarity after sorting tokens.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Partial token sort ratio as an integer from 0 to 100
        """
        return _PurePythonFuzz._token_sort_score(
            s1, s2, _PurePythonFuzz.partial_ratio
        )

    @staticmethod
    def partial_token_set_ratio(s1: str, s2: str) -> int:
        """
        Calculate partial similarity using set operations on tokens.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Partial token set ratio as an integer from 0 to 100
        """
        return _PurePythonFuzz._token_set_score(
            s1, s2, _PurePythonFuzz.partial_ratio
        )
class FuzzyMatcher:
    """
    Fuzzy string matching class with the same API as thefuzz.fuzz.

    On construction the matcher binds to the thefuzz scorer module when it
    was imported successfully, otherwise to the pure Python fallback class.
    Every scoring method is best-effort: if the backend raises, a warning is
    logged and 0 is returned instead of propagating the exception.
    """

    def __init__(self):
        """Initialize the fuzzy matcher with the appropriate backend."""
        if THEFUZZ_AVAILABLE and _fuzz_impl is not None:
            self._impl = _fuzz_impl
            self._backend = "thefuzz"
        else:
            self._impl = _PurePythonFuzz
            self._backend = "pure_python"
        logger.debug("FuzzyMatcher initialized with backend: %s", self._backend)

    @property
    def backend(self) -> str:
        """Get the name of the backend being used."""
        return self._backend

    def _score(self, method_name: str, s1: str, s2: str) -> int:
        """
        Call one backend scorer by name, converting any failure into 0.

        Args:
            method_name: Name of the scorer attribute on the backend
            s1: First string
            s2: Second string

        Returns:
            The backend's result, or 0 if the lookup or the call raised
        """
        try:
            # The attribute lookup stays inside the try so that a backend
            # lacking the scorer is handled like any other failure.
            return getattr(self._impl, method_name)(s1, s2)
        except Exception as e:
            # Deliberate best-effort boundary: matching must never crash
            # the caller, so log and fall back to "no similarity".
            logger.warning("Error in %s calculation: %s", method_name, e)
            return 0

    def ratio(self, s1: str, s2: str) -> int:
        """
        Calculate the similarity ratio between two strings.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Similarity ratio as an integer from 0 to 100
        """
        return self._score("ratio", s1, s2)

    def partial_ratio(self, s1: str, s2: str) -> int:
        """
        Calculate the partial similarity ratio between two strings.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Partial similarity ratio as an integer from 0 to 100
        """
        return self._score("partial_ratio", s1, s2)

    def token_sort_ratio(self, s1: str, s2: str) -> int:
        """
        Calculate similarity after sorting tokens in both strings.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Token sort ratio as an integer from 0 to 100
        """
        return self._score("token_sort_ratio", s1, s2)

    def token_set_ratio(self, s1: str, s2: str) -> int:
        """
        Calculate similarity using set operations on tokens.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Token set ratio as an integer from 0 to 100
        """
        return self._score("token_set_ratio", s1, s2)

    def partial_token_sort_ratio(self, s1: str, s2: str) -> int:
        """
        Calculate partial similarity after sorting tokens.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Partial token sort ratio as an integer from 0 to 100
        """
        return self._score("partial_token_sort_ratio", s1, s2)

    def partial_token_set_ratio(self, s1: str, s2: str) -> int:
        """
        Calculate partial similarity using set operations on tokens.

        Args:
            s1: First string
            s2: Second string

        Returns:
            Partial token set ratio as an integer from 0 to 100
        """
        return self._score("partial_token_set_ratio", s1, s2)

    def best_ratio(self, s1: str, s2: str) -> int:
        """
        Get the best ratio from all available methods.

        Args:
            s1: First string
            s2: Second string

        Returns:
            The highest of the six scorer results, as an integer from 0 to 100
        """
        # Go through the public methods (not _score directly) so subclass
        # overrides of individual scorers are honoured.
        return max(
            self.ratio(s1, s2),
            self.partial_ratio(s1, s2),
            self.token_sort_ratio(s1, s2),
            self.token_set_ratio(s1, s2),
            self.partial_token_sort_ratio(s1, s2),
            self.partial_token_set_ratio(s1, s2),
        )
# Shared module-level matcher; its backend is chosen once, when this module
# is first imported.
fuzz = FuzzyMatcher()

# Module-level aliases bound to the shared matcher's methods, so callers can
# write `from <this module> import ratio` the same way they would import from
# thefuzz.fuzz.
ratio = fuzz.ratio
partial_ratio = fuzz.partial_ratio
token_sort_ratio = fuzz.token_sort_ratio
partial_token_sort_ratio = fuzz.partial_token_sort_ratio
token_set_ratio = fuzz.token_set_ratio
partial_token_set_ratio = fuzz.partial_token_set_ratio

# Public API of this module.
__all__ = [
    "FuzzyMatcher",
    "fuzz",
    "ratio",
    "partial_ratio",
    "token_sort_ratio",
    "token_set_ratio",
    "partial_token_sort_ratio",
    "partial_token_set_ratio",
    "THEFUZZ_AVAILABLE",
]