# MIT License
#
# Copyright (c) 2022 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
"""
.. module:: editdistance
:synopsis: Module for edit distance algorithms.
"""
from abc import ABC, abstractmethod
from enum import Enum
from typing import List
from editdistpy import damerau_osa, levenshtein
from symspellpy import helpers
[docs]class DistanceAlgorithm(Enum):
"""Supported edit distance algorithms."""
LEVENSHTEIN = 0 #: Levenshtein algorithm.
DAMERAU_OSA = 1 #: Damerau optimal string alignment algorithm
LEVENSHTEIN_FAST = 2 #: Fast Levenshtein algorithm.
DAMERAU_OSA_FAST = 3 #: Fast Damerau optimal string alignment algorithm
[docs]class EditDistance:
"""Edit distance algorithms.
Args:
algorithm: The distance algorithm to use.
Attributes:
_algorithm (:class:`DistanceAlgorithm`): The edit distance algorithm to
use.
_distance_comparer (:class:`AbstractDistanceComparer`): An object to
compute the relative distance between two strings. The concrete
object will be chosen based on the value of :attr:`_algorithm`.
Raises:
ValueError: If `algorithm` specifies an invalid distance algorithm.
"""
def __init__(self, algorithm: DistanceAlgorithm) -> None:
self._distance_comparer: AbstractDistanceComparer
self._algorithm = algorithm
if algorithm == DistanceAlgorithm.LEVENSHTEIN:
self._distance_comparer = Levenshtein()
elif algorithm == DistanceAlgorithm.DAMERAU_OSA:
self._distance_comparer = DamerauOsa()
elif algorithm == DistanceAlgorithm.LEVENSHTEIN_FAST:
self._distance_comparer = LevenshteinFast()
elif algorithm == DistanceAlgorithm.DAMERAU_OSA_FAST:
self._distance_comparer = DamerauOsaFast()
else:
raise ValueError("unknown distance algorithm")
[docs] def compare(self, string_1: str, string_2: str, max_distance: int) -> int:
"""Compares a string to the base string to determine the edit distance,
using the previously selected algorithm.
Args:
string_1: Base string.
string_2: The string to compare.
max_distance: The maximum distance allowed.
Returns:
The edit distance (or -1 if `max_distance` exceeded).
"""
return self._distance_comparer.distance(string_1, string_2, max_distance)
[docs]class AbstractDistanceComparer(ABC):
"""An interface to compute relative distance between two strings."""
[docs] @abstractmethod
def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
"""Returns a measure of the distance between two strings.
Args:
string_1: One of the strings to compare.
string_2: The other string to compare.
max_distance: The maximum distance that is of interest.
Returns:
-1 if the distance is greater than the max_distance, 0 if the strings
are equivalent, otherwise a positive number whose magnitude
increases as difference between the strings increases.
"""
[docs]class Levenshtein(AbstractDistanceComparer):
"""Provides Levenshtein algorithm for computing edit distance metric between
two strings.
Attributes:
_base_char_1_costs (List[int]):
"""
def __init__(self):
self._base_char_1_costs = []
[docs] def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
"""Computes the Levenshtein edit distance between two strings.
Args:
string_1: One of the strings to compare.
string_2: The other string to compare.
max_distance: The maximum distance that is of interest.
Returns:
-1 if the distance is greater than the max_distance, 0 if the strings
are equivalent, otherwise a positive number whose magnitude
increases as difference between the strings increases.
"""
if string_1 is None or string_2 is None:
return helpers.null_distance_results(string_1, string_2, max_distance)
if max_distance <= 0:
return 0 if string_1 == string_2 else -1
max_distance = int(min(2 ** 31 - 1, max_distance))
# if strings of different lengths, ensure shorter string is in string_1.
# This can result in a little faster speed by spending more time spinning
# just the inner loop during the main processing.
if len(string_1) > len(string_2):
string_2, string_1 = string_1, string_2
if len(string_2) - len(string_1) > max_distance:
return -1
# identify common suffic and/or prefix that can be ignored
len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
if len_1 == 0:
return len_2 if len_2 <= max_distance else -1
if len_2 > len(self._base_char_1_costs):
self._base_char_1_costs = [0 for _ in range(len_2)]
if max_distance < len_2:
return self._distance_max(
string_1,
string_2,
len_1,
len_2,
start,
max_distance,
self._base_char_1_costs,
)
return self._distance(
string_1, string_2, len_1, len_2, start, self._base_char_1_costs
)
@staticmethod
def _distance(
string_1: str,
string_2: str,
len_1: int,
len_2: int,
start: int,
char_1_costs: List[int],
) -> int:
"""Internal implementation of the core Levenshtein algorithm.
**From**: https://github.com/softwx/SoftWx.Match
"""
char_1_costs = [j + 1 for j in range(len_2)]
current_cost = 0
for i in range(len_1):
left_char_cost = above_char_cost = i
char_1 = string_1[start + i]
for j in range(len_2):
# cost of diagonal (substitution)
current_cost = left_char_cost
left_char_cost = char_1_costs[j]
if string_2[start + j] != char_1:
# substitution if neither of the two conditions below
if above_char_cost < current_cost:
current_cost = above_char_cost
if left_char_cost < current_cost:
current_cost = left_char_cost
current_cost += 1
char_1_costs[j] = above_char_cost = current_cost
return current_cost
@staticmethod
def _distance_max(
string_1: str,
string_2: str,
len_1: int,
len_2: int,
start: int,
max_distance: int,
char_1_costs: List[int],
) -> int:
"""Internal implementation of the core Levenshtein algorithm that accepts
a max_distance.
**From**: https://github.com/softwx/SoftWx.Match
"""
char_1_costs = [
j + 1 if j < max_distance else max_distance + 1 for j in range(len_2)
]
len_diff = len_2 - len_1
j_start_offset = max_distance - len_diff
j_start = 0
j_end = max_distance
current_cost = 0
for i in range(len_1):
char_1 = string_1[start + i]
prev_char_1_cost = above_char_cost = i
# no need to look beyond window of lower right diagonal -
# max_distance cells (lower right diag is i - lenDiff) and the upper
# left diagonal + max_distance cells (upper left is i)
j_start += 1 if i > j_start_offset else 0
j_end += 1 if j_end < len_2 else 0
for j in range(j_start, j_end):
# cost of diagonal (substitution)
current_cost = prev_char_1_cost
prev_char_1_cost = char_1_costs[j]
if string_2[start + j] != char_1:
# substitution if neither of the two conditions below
if above_char_cost < current_cost:
current_cost = above_char_cost
if prev_char_1_cost < current_cost:
current_cost = prev_char_1_cost
current_cost += 1
char_1_costs[j] = above_char_cost = current_cost
if char_1_costs[i + len_diff] > max_distance:
return -1
return current_cost if current_cost <= max_distance else -1
[docs]class DamerauOsa(AbstractDistanceComparer):
"""Provides optimized methods for computing Damerau-Levenshtein Optimal
String Alignment (OSA) comparisons between two strings.
Attributes:
_base_char_1_costs (List[int]):
_base_prev_char_1_costs (List[int]):
"""
def __init__(self) -> None:
self._base_char_1_costs: List[int] = []
self._base_prev_char_1_costs: List[int] = []
[docs] def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
"""Computes the Damerau-Levenshtein optimal string alignment edit
distance between two strings.
Args:
string_1: One of the strings to compare.
string_2: The other string to compare.
max_distance: The maximum distance that is of interest.
Returns:
-1 if the distance is greater than the max_distance, 0 if the strings
are equivalent, otherwise a positive number whose magnitude
increases as difference between the strings increases.
"""
if string_1 is None or string_2 is None:
return helpers.null_distance_results(string_1, string_2, max_distance)
if max_distance <= 0:
return 0 if string_1 == string_2 else -1
max_distance = int(min(2 ** 31 - 1, max_distance))
# if strings of different lengths, ensure shorter string is in string_1.
# This can result in a little faster speed by spending more time spinning
# just the inner loop during the main processing.
if len(string_1) > len(string_2):
string_2, string_1 = string_1, string_2
if len(string_2) - len(string_1) > max_distance:
return -1
# identify common suffix and/or prefix that can be ignored
len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
if len_1 == 0:
return len_2 if len_2 <= max_distance else -1
if len_2 > len(self._base_char_1_costs):
self._base_char_1_costs = [0 for _ in range(len_2)]
self._base_prev_char_1_costs = [0 for _ in range(len_2)]
if max_distance < len_2:
return self._distance_max(
string_1,
string_2,
len_1,
len_2,
start,
max_distance,
self._base_char_1_costs,
self._base_prev_char_1_costs,
)
return self._distance(
string_1,
string_2,
len_1,
len_2,
start,
self._base_char_1_costs,
self._base_prev_char_1_costs,
)
@staticmethod
def _distance(
string_1: str,
string_2: str,
len_1: int,
len_2: int,
start: int,
char_1_costs: List[int],
prev_char_1_costs: List[int],
) -> int:
"""Internal implementation of the core Damerau-Levenshtein, optimal
string alignment algorithm.
**From**: https://github.com/softwx/SoftWx.Match
"""
char_1_costs = [j + 1 for j in range(len_2)]
char_1 = " "
current_cost = 0
for i in range(len_1):
prev_char_1 = char_1
char_1 = string_1[start + i]
char_2 = " "
left_char_cost = above_char_cost = i
next_trans_cost = 0
for j in range(len_2):
this_trans_cost = next_trans_cost
next_trans_cost = prev_char_1_costs[j]
# cost of diagonal (substitution)
prev_char_1_costs[j] = current_cost = left_char_cost
# left now equals current cost (which will be diagonal
# at next iteration)
left_char_cost = char_1_costs[j]
prev_char_2 = char_2
char_2 = string_2[start + j]
if char_1 != char_2:
# substitution if neither of two conditions below
if above_char_cost < current_cost:
current_cost = above_char_cost
if left_char_cost < current_cost:
current_cost = left_char_cost
current_cost += 1
if (
i != 0
and j != 0
and char_1 == prev_char_2
and prev_char_1 == char_2
and this_trans_cost + 1 < current_cost
):
# transposition
current_cost = this_trans_cost + 1
char_1_costs[j] = above_char_cost = current_cost
return current_cost
@staticmethod
def _distance_max(
string_1: str,
string_2: str,
len_1: int,
len_2: int,
start: int,
max_distance: int,
char_1_costs: List[int],
prev_char_1_costs: List[int],
) -> int:
"""Internal implementation of the core Damerau-Levenshtein, optimal
string alignment algorithm that accepts a max_distance.
**From**: https://github.com/softwx/SoftWx.Match
"""
char_1_costs = [
j + 1 if j < max_distance else max_distance + 1 for j in range(len_2)
]
len_diff = len_2 - len_1
j_start_offset = max_distance - len_diff
j_start = 0
j_end = max_distance
char_1 = " "
current_cost = 0
for i in range(len_1):
prev_char_1 = char_1
char_1 = string_1[start + i]
char_2 = " "
left_char_cost = above_char_cost = i
next_trans_cost = 0
# no need to look beyond window of lower right diagonal -
# max_distance cells (lower right diag is i - len_diff) and the upper
# left diagonal + max_distance cells (upper left is i)
j_start += 1 if i > j_start_offset else 0
j_end += 1 if j_end < len_2 else 0
for j in range(j_start, j_end):
this_trans_cost = next_trans_cost
next_trans_cost = prev_char_1_costs[j]
# cost of diagonal (substitution)
prev_char_1_costs[j] = current_cost = left_char_cost
# left now equals current cost (which will be diagonal at next
# iteration)
left_char_cost = char_1_costs[j]
prev_char_2 = char_2
char_2 = string_2[start + j]
if char_1 != char_2:
# substitution if neither of two conditions below
if above_char_cost < current_cost:
current_cost = above_char_cost
if left_char_cost < current_cost:
current_cost = left_char_cost
current_cost += 1
if (
i != 0
and j != 0
and char_1 == prev_char_2
and prev_char_1 == char_2
and this_trans_cost + 1 < current_cost
):
# transposition
current_cost = this_trans_cost + 1
char_1_costs[j] = above_char_cost = current_cost
if char_1_costs[i + len_diff] > max_distance:
return -1
return current_cost if current_cost <= max_distance else -1
[docs]class LevenshteinFast(AbstractDistanceComparer):
"""Provides an interface for computing edit distance metric between two
strings using the fast Levenshtein algorithm.
"""
[docs] def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
"""Computes the Levenshtein edit distance between two strings.
Args:
string_1: One of the strings to compare.
string_2: The other string to compare.
max_distance: The maximum distance that is of interest.
Returns:
-1 if the distance is greater than the max_distance, 0 if the strings
are equivalent, otherwise a positive number whose magnitude
increases as difference between the strings increases.
"""
return levenshtein.distance(string_1, string_2, max_distance)
[docs]class DamerauOsaFast(AbstractDistanceComparer):
"""Provides an interface for computing edit distance metric between two
strings using the fast Damerau-Levenshtein Optimal String Alignment (OSA)
algorithm.
"""
[docs] def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
"""Computes the Damerau-Levenshtein optimal string alignment edit
distance between two strings.
Args:
string_1: One of the strings to compare.
string_2: The other string to compare.
max_distance: The maximum distance that is of interest.
Returns:
-1 if the distance is greater than the max_distance, 0 if the strings
are equivalent, otherwise a positive number whose magnitude
increases as difference between the strings increases.
"""
return damerau_osa.distance(string_1, string_2, max_distance)