# Source code for symspellpy.helpers

# MIT License
#
# Copyright (c) 2022 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

"""
.. module:: helpers
   :synopsis: Helper functions
"""

import re
import sys
from difflib import SequenceMatcher
from typing import Dict, List, Optional, Tuple


def case_transfer_matching(cased_text: str, uncased_text: str) -> str:
    """Transfers the casing from one text to another, assuming the two texts
    are 'matching', i.e., they have the same length.

    The casing is transferred position by position: the character of
    `uncased_text` is upper-cased where the character at the same index of
    `cased_text` is upper case, and lower-cased everywhere else.

    Args:
        cased_text: Text with varied casing.
        uncased_text: Text that is in lowercase only.

    Returns:
        Text with the content of `uncased_text` and the casing of `cased_text`.

    Raises:
        ValueError: If the input texts have different lengths.
    """
    if len(cased_text) != len(uncased_text):
        raise ValueError(
            "'cased_text' and 'uncased_text' don't have the same length, use "
            "case_transfer_similar() instead"
        )

    return "".join(
        [
            target.upper() if reference.isupper() else target.lower()
            for reference, target in zip(cased_text, uncased_text)
        ]
    )


def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
    """Transfers the casing from one text to another - for similar (not matching)
    text.

    Use `difflib.SequenceMatcher` to identify the different type of changes
    needed to turn `cased_text` into `uncased_text`.

    - For inserted sections: transfer the casing from the prior character. If no
      character before or the character before is the space, transfer the casing
      from the following character. If there is no following character either
      (insertion at the very end, right after a space), the prior character is
      used.
    - For deleted sections: no case transfer is required.
    - For equal sections: swap out the text with the original, the cased one, as
      otherwise the two are the same.
    - For replaced sections: transfer the casing using
      :meth:`case_transfer_matching` if the two has the same length, otherwise
      transfer character-by-character and carry the last casing over to any
      additional characters.

    Args:
        cased_text: Text with varied casing.
        uncased_text: Text in lowercase.

    Returns:
        Text with the content of `uncased_text` but the casing of `cased_text`.
        An empty `uncased_text` is returned unchanged.

    Raises:
        ValueError: If `cased_text` is empty (and `uncased_text` is not).
    """
    if not uncased_text:
        return uncased_text

    if not cased_text:
        raise ValueError("'cased_text' cannot be empty")

    matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
    last_index = len(cased_text) - 1
    # Collect the pieces and join once instead of repeated string concatenation
    pieces = []

    for tag, ia1, ia2, ib1, ib2 in matcher.get_opcodes():
        if tag == "delete":
            continue

        if tag == "equal":
            # Transfer the text from the cased_text, as anyhow they are equal
            # (without the casing)
            pieces.append(cased_text[ia1:ia2])
            continue

        uncased_seq = uncased_text[ib1:ib2]

        if tag == "insert":
            # For the first character or space on the left, take the casing from
            # the following character. Else take case the prior character
            if ia1 == 0 or cased_text[ia1 - 1] == " ":
                # An insertion at the very end has no following character;
                # clamp so we fall back to the last character instead of
                # indexing past the end of cased_text.
                ia_ref = min(ia1, last_index)
            else:
                ia_ref = ia1 - 1
            if cased_text[ia_ref].isupper():
                pieces.append(uncased_seq.upper())
            else:
                pieces.append(uncased_seq.lower())
            continue

        # Remaining tag is "replace"
        cased_seq = cased_text[ia1:ia2]
        if len(cased_seq) == len(uncased_seq):
            pieces.append(case_transfer_matching(cased_seq, uncased_seq))
            continue

        # transfer the casing character-by-character and using the last
        # casing to continue if we run out of the sequence
        pieces.extend(
            uncased.upper() if cased.isupper() else uncased.lower()
            for cased, uncased in zip(cased_seq, uncased_seq)
        )
        # Apply casing from the last character of cased_seq to the rest
        # of the uncased_seq
        if len(cased_seq) < len(uncased_seq):
            convert = str.upper if cased_seq[-1].isupper() else str.lower
            pieces.extend(map(convert, uncased_seq[len(cased_seq) :]))

    return "".join(pieces)


def increment_count(count: int, count_previous: int) -> int:
    """Adds two counts, saturating the result at ``sys.maxsize``.

    Args:
        count: The count to add.
        count_previous: The count accumulated so far.

    Returns:
        ``count_previous + count``, or ``sys.maxsize`` if the sum would reach
        or exceed ``sys.maxsize``.
    """
    return min(count_previous + count, sys.maxsize)


def is_acronym(word: str, contain_digits: bool = False) -> bool:
    """Checks if the word is all caps (acronym) and/or contain numbers.

    The all-caps check is a regular expression anchored at the start of
    `word` only: it succeeds when `word` begins with a run of at least two
    characters from ``A-Z``/``0-9`` that ends at a word boundary.

    Args:
        word: The word to check
        contain_digits: A flag to determine whether any term with digits can be
            considered as acronym

    Returns:
        True if the word is all caps and/or digits, e.g., ABCDE, AB12C. With
            `contain_digits` set, any word containing a digit also qualifies,
            e.g., abc12, ab12c. False if the word contains lower case letters
            (and, when `contain_digits` is set, no digit), e.g., abcde, ABCde,
            abcDE, abCDe.
    """
    if re.match(r"\b[A-Z0-9]{2,}\b", word) is not None:
        return True
    return contain_digits and any(char.isdigit() for char in word)


def null_distance_results(
    string1: Optional[str], string2: Optional[str], max_distance: int
) -> int:
    """Determines the proper return value of an edit distance function when one
    or both strings are null.

    Only `string1` is tested for None before falling back to its length, so
    the function is meant to be called when at least one of the two strings
    is None.

    Args:
        string1: Base string, or None.
        string2: The string to compare, or None.
        max_distance: The maximum distance allowed.

    Returns:
        -1 if the distance is greater than the max_distance, 0 if the strings are
            equivalent (both are None), otherwise a non-negative number equal
            to the length of the string which is not None.
    """
    if string1 is None:
        if string2 is None:
            return 0
        remaining = len(string2)
    else:
        remaining = len(string1)
    return remaining if remaining <= max_distance else -1


def parse_words(
    phrase: str, preserve_case: bool = False, split_by_space: bool = False
) -> List[str]:
    """Creates a non-unique wordlist from sample text. Language independent
    (e.g. works with Chinese characters)

    Args:
        phrase: Sample text that could contain one or more words.
        preserve_case: A flag to determine whether to preserve the cases or
            convert all to lowercase.
        split_by_space: Splits the phrase into words simply based on
            whitespace.

    Returns:
        A list of words
    """
    # Lowercasing happens before tokenizing in both modes
    if not preserve_case:
        phrase = phrase.lower()
    if split_by_space:
        return phrase.split()
    # \W non-words, use negated set to ignore non-words and "_" (underscore).
    # Compatible with non-latin characters, does not split words at apostrophes
    return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase)


def prefix_suffix_prep(string1: str, string2: str) -> Tuple[int, int, int]:
    """Calculates starting position and lengths of two strings such that common
    prefix and suffix substrings are excluded.
    Expects len(string1) <= len(string2).

    If `string1` happens to be longer than `string2`, the scans stop as soon
    as `string2` is exhausted instead of wrapping around to negative indices
    or indexing past its end.

    Args:
        string1: Base string.
        string2: The string to compare.

    Returns:
        A tuple ``(len1, len2, start)``: the lengths of `string1` and
            `string2` excluding the common prefix and suffix, and the starting
            position of the differing part.
    """
    len1 = len(string1)
    len2 = len(string2)
    # suffix common to both strings can be ignored
    while len1 != 0 and len2 != 0 and string1[len1 - 1] == string2[len2 - 1]:
        len1 -= 1
        len2 -= 1
    # prefix common to both strings can be ignored; never scan past the end of
    # the shorter remaining part
    limit = min(len1, len2)
    start = 0
    while start != limit and string1[start] == string2[start]:
        start += 1
    # lengths of the parts excluding common prefix and suffix
    return len1 - start, len2 - start, start


def to_similarity(distance: int, length: int) -> float:
    """Calculates a similarity measure from an edit distance.

    Args:
        distance: The edit distance between two strings.
        length: The length of the longer of the two strings the edit distance is
            from.

    Returns:
        A similarity value from 0 to 1.0 (1 - (distance / length)), -1 if
            distance is negative. Two empty strings (distance 0, length 0) are
            identical and yield 1.0.

    Raises:
        ZeroDivisionError: If `length` is 0 while `distance` is positive.
    """
    if distance < 0:
        return -1
    if distance == 0 and length == 0:
        # Two empty strings: avoid 0 / 0, they are a perfect match
        return 1.0
    return 1.0 - distance / length


def try_parse_int64(string: str) -> Optional[int]:
    """Converts the string representation of a number to its 64-bit signed
    integer equivalent.

    Args:
        string: String representation of a number.

    Returns:
        The 64-bit signed integer equivalent, or None if conversion failed or if
            the number is less than the min value or greater than the max value
            of a 64-bit signed integer.
    """
    try:
        ret = int(string)
    except (TypeError, ValueError):
        # TypeError covers non-string input such as None, so that a failed
        # conversion always yields None instead of propagating
        return None
    return ret if -(2 ** 63) <= ret <= 2 ** 63 - 1 else None


class DictIO:
    """An iterator wrapper for python dictionary to format the output as required
    by :meth:`load_dictionary_stream` and :meth:`load_dictionary_bigram_stream`.

    Each iteration step yields one line of the form
    ``<key><separator><value>``. The wrapper is a one-shot iterator: it holds
    a single iterator over ``dictionary.items()`` created at construction
    time.

    Args:
        dictionary: Dictionary with words as keys and frequency count as values.
        separator: Separator characters between term(s) and count.

    Attributes:
        iteritems: An iterator object of dictionary.items().
        separator: Separator characters between term(s) and count.
    """

    def __init__(self, dictionary: Dict[str, int], separator: str = " ") -> None:
        self.iteritems = iter(dictionary.items())
        self.separator = separator

    def __iter__(self) -> "DictIO":
        """Returns the wrapper itself, as it is its own iterator."""
        return self

    def __next__(self) -> str:
        """Returns the next ``key``/``value`` pair joined by the separator.

        Raises:
            StopIteration: When all items have been consumed.
        """
        entry = next(self.iteritems)
        return self.separator.join(map(str, entry))