# MIT License
#
# Copyright (c) 2025 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
"""
.. module:: helpers
:synopsis: Helper functions
"""
import re
import sys
import warnings
from difflib import SequenceMatcher
from typing import Optional
def _rename_args(kwargs_map: dict[str, str], version: str):
    """Builds a decorator that keeps deprecated keyword-argument names working.

    A call that uses a deprecated keyword name is forwarded to the wrapped
    function under the replacement name, and a ``DeprecationWarning`` is
    emitted for each deprecated name used.

    Args:
        kwargs_map: Mapping of deprecated keyword names to their replacements.
        version: Version string quoted in the warning message as the release
            in which the deprecated names will be removed.

    Returns:
        A decorator that wraps a function with the keyword translation.
    """
    # Local import keeps this helper independent of the module import block.
    from functools import wraps

    def decorator(func):
        # ``wraps`` preserves the wrapped function's name, docstring and
        # ``__wrapped__`` so documentation and introspection keep working.
        @wraps(func)
        def wrapped(*args, **kwargs):
            new_kwargs = {}
            for k, v in kwargs.items():
                if k in kwargs_map:
                    warnings.warn(
                        f"Keyword argument '{k}' is deprecated and will be removed in {version}. Use '{kwargs_map[k]}' instead.",
                        DeprecationWarning,
                        # Attribute the warning to the caller, not this wrapper.
                        stacklevel=2,
                    )
                # Unknown names pass through unchanged.
                new_kwargs[kwargs_map.get(k, k)] = v
            return func(*args, **new_kwargs)

        return wrapped

    return decorator
[docs]
def case_transfer_matching(cased_text: str, uncased_text: str) -> str:
    """Copies the casing of one text onto another text of the same length.

    Each character of `uncased_text` is upper-cased when the character at the
    same position in `cased_text` is upper case, and lower-cased otherwise.

    Args:
        cased_text: Text whose casing is used as the template.
        uncased_text: Text that receives the casing.

    Returns:
        The content of `uncased_text` with the casing of `cased_text`.

    Raises:
        ValueError: If the two texts have different lengths.
    """
    if len(cased_text) != len(uncased_text):
        raise ValueError(
            "'cased_text' and 'uncased_text' don't have the same length, use case_transfer_similar() instead"
        )

    pairs = zip(cased_text, uncased_text)
    return "".join(
        target.upper() if template.isupper() else target.lower()
        for template, target in pairs
    )
[docs]
def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
    """Transfers the casing from one text to another - for similar (not
    matching) text.

    Uses `difflib.SequenceMatcher` to identify the changes needed to turn
    `cased_text` (lower-cased) into `uncased_text`, then handles each kind:

    - Inserted sections: take the casing of the prior character. If there is
      no prior character, or the prior character is a space, take the casing
      of the following character instead. When the insertion is at the very
      end of `cased_text` there is no following character, so the prior
      character is used.
    - Deleted sections: nothing to transfer.
    - Equal sections: copy the original cased text, as the two only differ in
      casing.
    - Replaced sections: transfer the casing character-by-character and carry
      the casing of the last replaced character over to any additional
      characters.

    Args:
        cased_text: Text with varied casing.
        uncased_text: Text in lowercase.

    Returns:
        Text with the content of `uncased_text` but the casing of `cased_text`.
        An empty `uncased_text` is returned as is.

    Raises:
        ValueError: If `cased_text` is empty while `uncased_text` is not.
    """
    if not uncased_text:
        return uncased_text

    if not cased_text:
        raise ValueError("'cased_text' cannot be empty")

    matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
    # Collect the pieces and join once instead of repeated string concatenation.
    pieces = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "delete":
            continue

        if tag == "equal":
            # Same text apart from casing, so reuse the cased original.
            pieces.append(cased_text[i1:i2])
            continue

        uncased_seq = uncased_text[j1:j2]

        if tag == "insert":
            at_word_start = i1 == 0 or cased_text[i1 - 1] == " "
            # The following character only exists when the insertion is not
            # at the very end of cased_text; otherwise fall back to the prior
            # character to avoid indexing past the end.
            if at_word_start and i1 < len(cased_text):
                ia_ref = i1
            else:
                ia_ref = i1 - 1
            if cased_text[ia_ref].isupper():
                pieces.append(uncased_seq.upper())
            else:
                pieces.append(uncased_seq.lower())
            continue

        # tag == "replace": cased_seq is never empty for this opcode.
        cased_seq = cased_text[i1:i2]
        pieces.extend(
            uncased.upper() if cased.isupper() else uncased.lower()
            for cased, uncased in zip(cased_seq, uncased_seq)
        )
        # Extra characters inherit the casing of the last replaced character.
        if len(cased_seq) < len(uncased_seq):
            convert = str.upper if cased_seq[-1].isupper() else str.lower
            pieces.extend(map(convert, uncased_seq[len(cased_seq):]))

    return "".join(pieces)
[docs]
def increment_count(count: int, count_previous: int) -> int:
    """Adds two counts, capping the result at ``sys.maxsize``.

    Args:
        count: The count to add.
        count_previous: The count accumulated so far.

    Returns:
        The sum of both counts, or ``sys.maxsize`` if the sum would reach or
        exceed it.
    """
    headroom = sys.maxsize - count_previous
    if headroom > count:
        return count_previous + count
    return sys.maxsize
[docs]
def is_acronym(word: str, contain_digits: bool = False) -> bool:
    """Checks if the word is all caps (acronym) and/or contains numbers.

    Args:
        word: The word to check.
        contain_digits: A flag to determine whether any term with digits can
            be considered as acronym.

    Returns:
        True if the word starts with a run of at least two upper case letters
        and/or digits that ends on a word boundary, e.g., ABCDE, AB12C; or, if
        `contain_digits` is set, if the word contains any digit, e.g., abc12,
        ab12c. False if the word contains lower case letters, e.g., abcde,
        ABCde, abcDE, abCDe.
    """
    all_caps = re.match(r"\b[A-Z0-9]{2,}\b", word)
    if all_caps is not None:
        return True
    # The digit check only applies when the caller opted in.
    return contain_digits and any(char.isdigit() for char in word)
[docs]
@_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
def null_distance_results(
    string_1: Optional[str], string_2: Optional[str], max_distance: int
) -> int:
    """Determines the return value of an edit distance function when one or
    both strings are None.

    Args:
        string_1: Base string.
        string_2: The string to compare.
        max_distance: The maximum distance allowed.

    Returns:
        0 if both strings are None. Otherwise the length of `string_2` when
        `string_1` is None, or the length of `string_1` when it is not None;
        -1 if that length is greater than `max_distance`.
    """
    if string_1 is None and string_2 is None:
        return 0

    # With one side missing, the distance is the length of the other side.
    present = string_2 if string_1 is None else string_1
    distance = len(present)
    if distance <= max_distance:
        return distance
    return -1
[docs]
def parse_words(
    phrase: str, preserve_case: bool = False, split_by_space: bool = False
) -> list[str]:
    """Creates a non-unique wordlist from sample text. Language independent
    (e.g. works with Chinese characters).

    Args:
        phrase: Sample text that could contain one or more words.
        preserve_case: A flag to determine if we want to preserve the cases or
            convert all to lowercase.
        split_by_space: Splits the phrase into words simply based on
            whitespace.

    Returns:
        A list of words.
    """
    text = phrase if preserve_case else phrase.lower()

    if split_by_space:
        return text.split()

    # \W matches non-word characters; the negated set [^\W_] therefore matches
    # word characters except "_" (underscore). Compatible with non-latin
    # characters, and does not split words at apostrophes.
    return re.findall(r"([^\W_]+['’]*[^\W_]*)", text)
[docs]
@_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
def prefix_suffix_prep(string_1: str, string_2: str) -> tuple[int, int, int]:
    """Calculates starting position and lengths of two strings such that
    common prefix and suffix substrings are excluded.

    Expects len(string_1) <= len(string_2).

    Args:
        string_1: Base string.
        string_2: The string to compare.

    Returns:
        A tuple ``(len_1, len_2, start)``: the lengths of `string_1` and
        `string_2` excluding the common prefix and suffix, and the position
        at which the differing part starts.
    """
    end_1 = len(string_1)
    end_2 = len(string_2)

    # Walk backwards from both ends while the trailing characters agree.
    while end_1 != 0 and string_1[end_1 - 1] == string_2[end_2 - 1]:
        end_1 -= 1
        end_2 -= 1

    # Walk forwards while the leading characters agree, without running into
    # the suffix that was already stripped from string_1.
    offset = 0
    while offset != end_1 and string_1[offset] == string_2[offset]:
        offset += 1

    # What remains after dropping the shared prefix from both lengths.
    return end_1 - offset, end_2 - offset, offset
[docs]
def to_similarity(distance: int, length: int) -> float:
    """Calculates a similarity measure from an edit distance.

    Args:
        distance: The edit distance between two strings.
        length: The length of the longer of the two strings the edit distance
            is from.

    Returns:
        A similarity value computed as ``1 - distance / length``, which is
        1.0 when `distance` is 0 (including when `length` is 0). -1 if
        `distance` is negative.
    """
    if distance < 0:
        return -1
    # Identical strings are fully similar; returning early also avoids a
    # division by zero when both strings are empty (length == 0).
    if distance == 0:
        return 1.0
    return 1.0 - distance / length
[docs]
def try_parse_int64(string: str) -> Optional[int]:
    """Converts the string representation of a number to its 64-bit signed
    integer equivalent.

    Args:
        string: String representation of a number.

    Returns:
        The parsed integer, or None if ``int()`` rejects the string with a
        ``ValueError`` or if the number lies outside the range of a 64-bit
        signed integer.
    """
    try:
        parsed = int(string)
    except ValueError:
        return None

    # Bounds of a 64-bit signed integer.
    lowest, highest = -(2**63), 2**63 - 1
    if parsed < lowest or parsed > highest:
        return None
    return parsed
[docs]
class DictIO:
    """An iterator wrapper for python dictionary to format the output as
    required by :meth:`load_dictionary_stream` and
    :meth:`load_dictionary_bigram_stream`.

    Each iteration step yields one dictionary entry as a single string: the
    key and the value, both converted with ``str``, joined by the separator.

    Args:
        dictionary: dictionary with words as keys and frequency count as
            values.
        separator: Separator characters between term(s) and count.

    Attributes:
        iteritems: An iterator object of dictionary.items().
        separator: Separator characters between term(s) and count.
    """

    def __init__(self, dictionary: dict[str, int], separator: str = " ") -> None:
        self.separator = separator
        self.iteritems = iter(dictionary.items())

    def __iter__(self) -> "DictIO":
        # The object is its own iterator.
        return self

    def __next__(self) -> str:
        # StopIteration from the underlying iterator ends the iteration.
        entry = next(self.iteritems)
        return self.separator.join(str(part) for part in entry)