# Source code for symspellpy.helpers

# MIT License
#
# Copyright (c) 2022 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

"""
.. module:: helpers
   :synopsis: Helper functions
"""

import re
import sys
from difflib import SequenceMatcher
from typing import Dict, List, Optional, Tuple


def case_transfer_matching(cased_text: str, uncased_text: str) -> str:
    """Copies the casing of one text onto another text of the same length.

    The two texts are walked in lockstep: wherever a character of
    `cased_text` is uppercase, the character at the same position of
    `uncased_text` is uppercased; everywhere else it is lowercased.

    Args:
        cased_text: Text whose casing pattern is used as the template.
        uncased_text: Text that supplies the characters of the result.

    Returns:
        Text with the content of `uncased_text` and the casing of
        `cased_text`.

    Raises:
        ValueError: If the input texts have different lengths.
    """
    if len(cased_text) != len(uncased_text):
        raise ValueError(
            "'cased_text' and 'uncased_text' don't have the same length, use "
            "case_transfer_similar() instead"
        )

    transferred = []
    for template_char, target_char in zip(cased_text, uncased_text):
        if template_char.isupper():
            transferred.append(target_char.upper())
        else:
            transferred.append(target_char.lower())
    return "".join(transferred)
def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
    """Transfers the casing from one text to another - for similar (not
    matching) text.

    Uses `difflib.SequenceMatcher` to identify the different type of changes
    needed to turn `cased_text` into `uncased_text`.

    - For inserted sections: transfer the casing from the prior character.
      If there is no character before, or the character before is a space,
      transfer the casing from the following character. If there is no
      following character either (insertion at the very end, right after a
      space), fall back to the prior character.
    - For deleted sections: no case transfer is required.
    - For equal sections: take the text from `cased_text`, since the two
      only differ in casing.
    - For replaced sections: transfer the casing character-by-character (the
      same rule as :meth:`case_transfer_matching`) and carry the casing of
      the last replaced character over to any additional characters.

    Note:
        The matcher compares against ``cased_text.lower()`` while the opcode
        indices are applied to `cased_text` itself, so this relies on
        lowercasing not changing the length of the text.

    Args:
        cased_text: Text with varied casing.
        uncased_text: Text in lowercase.

    Returns:
        Text with the content of `uncased_text` but the casing of
        `cased_text`. An empty `uncased_text` is returned unchanged.

    Raises:
        ValueError: If `cased_text` is empty (and `uncased_text` is not).
    """
    if not uncased_text:
        return uncased_text

    if not cased_text:
        raise ValueError("'cased_text' cannot be empty")

    matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
    last_index = len(cased_text) - 1
    pieces = []
    for tag, ia1, ia2, ib1, ib2 in matcher.get_opcodes():
        if tag == "delete":
            continue

        if tag == "equal":
            # Same characters apart from casing, so reuse the cased original.
            pieces.append(cased_text[ia1:ia2])
            continue

        uncased_seq = uncased_text[ib1:ib2]
        if tag == "insert":
            # For the first character or a space on the left, take the casing
            # from the following character. Else take it from the prior one.
            ia_ref = ia1 if ia1 == 0 or cased_text[ia1 - 1] == " " else ia1 - 1
            # An insertion at the very end has no following character; clamp
            # so we never index past the end of cased_text.
            ia_ref = min(ia_ref, last_index)
            if cased_text[ia_ref].isupper():
                pieces.append(uncased_seq.upper())
            else:
                pieces.append(uncased_seq.lower())
        else:  # "replace"
            cased_seq = cased_text[ia1:ia2]
            # zip() stops at the shorter sequence, so this covers both the
            # equal-length case and the common prefix of unequal lengths.
            pieces.extend(
                uncased.upper() if cased.isupper() else uncased.lower()
                for cased, uncased in zip(cased_seq, uncased_seq)
            )
            # Apply the casing of the last character of cased_seq to the
            # rest of uncased_seq.
            if len(cased_seq) < len(uncased_seq):
                convert = str.upper if cased_seq[-1].isupper() else str.lower
                pieces.append("".join(map(convert, uncased_seq[len(cased_seq):])))

    return "".join(pieces)
def increment_count(count: int, count_previous: int) -> int:
    """Adds `count` to `count_previous`, capping the sum at ``sys.maxsize``.

    Args:
        count: The amount to add.
        count_previous: The running total so far.

    Returns:
        ``count_previous + count``, or ``sys.maxsize`` if the sum would reach
        or exceed it.
    """
    return min(count_previous + count, sys.maxsize)
def is_acronym(word: str, contain_digits: bool = False) -> bool:
    """Checks if the word is all caps (acronym) and/or contains numbers.

    The word counts as an acronym when it starts with a run of at least two
    uppercase ASCII letters/digits delimited by word boundaries. When
    `contain_digits` is set, any word with at least one digit also counts.

    Args:
        word: The word to check.
        contain_digits: A flag to determine whether any term with digits can
            be considered as acronym.

    Returns:
        True if the word is all caps and/or contains numbers, e.g., ABCDE,
        AB12C, and - only with `contain_digits` - abc12, ab12c. False if the
        word contains lower case letters, e.g., abcde, ABCde, abcDE, abCDe.
    """
    if re.match(r"\b[A-Z0-9]{2,}\b", word) is not None:
        return True
    # Not an all-caps token; it can still qualify through the digit rule.
    return contain_digits and any(char.isdigit() for char in word)
def null_distance_results(
    string1: Optional[str], string2: Optional[str], max_distance: int
) -> int:
    """Determines the proper return value of an edit distance function when
    one or both strings are null.

    Args:
        string1: Base string.
        string2: The string to compare.
        max_distance: The maximum distance allowed.

    Returns:
        0 if both strings are None. Otherwise the length of `string1` if it
        is not None, else the length of `string2` - or -1 if that length is
        greater than `max_distance`.
    """
    if string1 is not None:
        length = len(string1)
    elif string2 is not None:
        length = len(string2)
    else:
        # Both are None: the strings are considered equivalent.
        return 0
    return length if length <= max_distance else -1
def parse_words(
    phrase: str, preserve_case: bool = False, split_by_space: bool = False
) -> List[str]:
    """Creates a non-unique wordlist from sample text. Language independent
    (e.g. works with Chinese characters).

    Args:
        phrase: Sample text that could contain one or more words.
        preserve_case: A flag to determine if we want to preserve the cases
            or convert all to lowercase.
        split_by_space: Splits the phrase into words simply based on
            whitespace instead of using the word pattern.

    Returns:
        A list of words.
    """
    text = phrase if preserve_case else phrase.lower()
    if split_by_space:
        return text.split()
    # \W matches non-word characters; the negated set [^\W_] therefore keeps
    # word characters except "_" (underscore). Compatible with non-latin
    # characters, and does not split words at apostrophes.
    return re.findall(r"([^\W_]+['’]*[^\W_]*)", text)
def prefix_suffix_prep(string1: str, string2: str) -> Tuple[int, int, int]:
    """Calculates starting position and lengths of two strings such that
    common prefix and suffix substrings are excluded.

    Expects len(string1) <= len(string2).

    Args:
        string1: Base string.
        string2: The string to compare.

    Returns:
        A tuple ``(len1, len2, start)``: the lengths of `string1` and
        `string2` once the common prefix and suffix are excluded, and the
        starting position of that differing part.
    """
    end1, end2 = len(string1), len(string2)

    # Suffix common to both strings can be ignored: shrink both ends while
    # the trailing characters agree.
    while end1 != 0 and string1[end1 - 1] == string2[end2 - 1]:
        end1 -= 1
        end2 -= 1

    # Prefix common to both strings can be ignored: advance while the leading
    # characters agree, without running into the stripped suffix.
    start = 0
    while start != end1 and string1[start] == string2[start]:
        start += 1

    # Lengths of the part excluding common prefix and suffix.
    return end1 - start, end2 - start, start
def to_similarity(distance: int, length: int) -> float:
    """Calculates a similarity measure from an edit distance.

    Args:
        distance: The edit distance between two strings.
        length: The length of the longer of the two strings the edit distance
            is from.

    Returns:
        A similarity value computed as ``1 - (distance / length)``; 1.0 means
        the strings are identical. -1 if `distance` is negative.

    Raises:
        ZeroDivisionError: If `length` is 0 while `distance` is positive.
    """
    if distance < 0:
        return -1
    if length == 0 and distance == 0:
        # Two empty strings are identical; avoid dividing 0 by 0.
        return 1.0
    return 1.0 - distance / length
def try_parse_int64(string: str) -> Optional[int]:
    """Converts the string representation of a number to its 64-bit signed
    integer equivalent.

    Args:
        string: String representation of a number.

    Returns:
        The 64-bit signed integer equivalent, or None if conversion failed
        (including a None or otherwise non-parsable input) or if the number
        is less than the min value or greater than the max value of a 64-bit
        signed integer.
    """
    try:
        value = int(string)
    except (TypeError, ValueError):
        # ValueError: not a valid integer literal.
        # TypeError: input such as None that int() cannot accept at all.
        return None
    return value if -(2 ** 63) <= value <= 2 ** 63 - 1 else None
class DictIO:
    """An iterator wrapper for python dictionary to format the output as
    required by :meth:`load_dictionary_stream` and
    :meth:`load_dictionary_bigram_stream`.

    Each iteration step yields one dictionary entry rendered as
    ``<key><separator><value>``.

    Args:
        dictionary: Dictionary with words as keys and frequency count as
            values.
        separator: Separator characters between term(s) and count.

    Attributes:
        iteritems: An iterator object of dictionary.items().
        separator: Separator characters between term(s) and count.
    """

    def __init__(self, dictionary: Dict[str, int], separator: str = " ") -> None:
        self.separator = separator
        self.iteritems = iter(dictionary.items())

    def __iter__(self) -> "DictIO":
        # The wrapper is its own iterator.
        return self

    def __next__(self) -> str:
        # StopIteration from the underlying iterator propagates to the caller
        # and ends the iteration.
        term, count = next(self.iteritems)
        return self.separator.join((str(term), str(count)))