Source code for ocr_stringdist.levenshtein

from typing import Optional

from ._rust_stringdist import *  # noqa: F403
from .default_ocr_distances import ocr_distance_map


def weighted_levenshtein_distance(
    s1: str,
    s2: str,
    /,
    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
    insertion_costs: Optional[dict[str, float]] = None,
    deletion_costs: Optional[dict[str, float]] = None,
    *,
    symmetric_substitution: bool = True,
    default_substitution_cost: float = 1.0,
    default_insertion_cost: float = 1.0,
    default_deletion_cost: float = 1.0,
) -> float:
    """
    Compute a Levenshtein distance whose edit operations carry configurable costs.

    When no `substitution_costs` are supplied, the table of common OCR errors,
    :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`, is used.

    :param s1: First string (interpreted as the string read via OCR)
    :param s2: Second string
    :param substitution_costs: Dictionary mapping tuples of strings
        ("substitution tokens") to their substitution costs. Only one direction
        needs to be configured unless `symmetric_substitution` is False. Note
        that the runtime scales in the length of the longest substitution token.
        Defaults to `ocr_stringdist.ocr_distance_map`.
    :param insertion_costs: Dictionary mapping strings to their insertion costs.
        Omitting it passes an empty dictionary to the backend.
    :param deletion_costs: Dictionary mapping strings to their deletion costs.
        Omitting it passes an empty dictionary to the backend.
    :param symmetric_substitution: Should the keys of `substitution_costs` be
        considered to be symmetric? Defaults to True.
    :param default_substitution_cost: The default substitution cost for
        character pairs not found in `substitution_costs`.
    :param default_insertion_cost: The default insertion cost for characters
        not found in `insertion_costs`.
    :param default_deletion_cost: The default deletion cost for characters
        not found in `deletion_costs`.
    :return: The distance value returned by the Rust implementation.
    """
    # Resolve the optional cost tables before handing over to the backend.
    effective_substitutions = (
        ocr_distance_map if substitution_costs is None else substitution_costs
    )
    effective_insertions = {} if insertion_costs is None else insertion_costs
    effective_deletions = {} if deletion_costs is None else deletion_costs

    # The computation itself lives in Rust, see src/rust_stringdist.rs.
    return _weighted_levenshtein_distance(  # type: ignore  # noqa: F405
        s1,
        s2,
        substitution_costs=effective_substitutions,
        insertion_costs=effective_insertions,
        deletion_costs=effective_deletions,
        symmetric_substitution=symmetric_substitution,
        default_substitution_cost=default_substitution_cost,
        default_insertion_cost=default_insertion_cost,
        default_deletion_cost=default_deletion_cost,
    )
def batch_weighted_levenshtein_distance(
    s: str,
    candidates: list[str],
    /,
    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
    insertion_costs: Optional[dict[str, float]] = None,
    deletion_costs: Optional[dict[str, float]] = None,
    *,
    symmetric_substitution: bool = True,
    default_substitution_cost: float = 1.0,
    default_insertion_cost: float = 1.0,
    default_deletion_cost: float = 1.0,
) -> list[float]:
    """
    Compute weighted Levenshtein distances from one string to many candidates.

    This is more efficient than calling :func:`weighted_levenshtein_distance`
    multiple times.

    :param s: The string to compare (interpreted as the string read via OCR)
    :param candidates: List of candidate strings to compare against
    :param substitution_costs: Dictionary mapping tuples of strings
        ("substitution tokens") to their substitution costs. Only one direction
        needs to be configured unless `symmetric_substitution` is False. Note
        that the runtime scales in the length of the longest substitution token.
        Defaults to `ocr_stringdist.ocr_distance_map`.
    :param insertion_costs: Dictionary mapping strings to their insertion costs.
        Omitting it passes an empty dictionary to the backend.
    :param deletion_costs: Dictionary mapping strings to their deletion costs.
        Omitting it passes an empty dictionary to the backend.
    :param symmetric_substitution: Should the keys of `substitution_costs` be
        considered to be symmetric? Defaults to True.
    :param default_substitution_cost: The default substitution cost for
        character pairs not found in `substitution_costs`.
    :param default_insertion_cost: The default insertion cost for characters
        not found in `insertion_costs`.
    :param default_deletion_cost: The default deletion cost for characters
        not found in `deletion_costs`.
    :return: A list of distances corresponding to each candidate
    """
    # Resolve the optional cost tables before handing over to the backend.
    effective_substitutions = (
        ocr_distance_map if substitution_costs is None else substitution_costs
    )
    effective_insertions = {} if insertion_costs is None else insertion_costs
    effective_deletions = {} if deletion_costs is None else deletion_costs

    # The batch computation lives in Rust, see src/rust_stringdist.rs.
    return _batch_weighted_levenshtein_distance(  # type: ignore  # noqa: F405
        s,
        candidates,
        substitution_costs=effective_substitutions,
        insertion_costs=effective_insertions,
        deletion_costs=effective_deletions,
        symmetric_substitution=symmetric_substitution,
        default_substitution_cost=default_substitution_cost,
        default_insertion_cost=default_insertion_cost,
        default_deletion_cost=default_deletion_cost,
    )