Source code for symspellpy.pickle_mixin

# MIT License
#
# Copyright (c) 2022 mmb L (Python port)
# Copyright (c) 2021 Wolf Garbe (Original C# implementation)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

"""
.. module:: pickle_mixing
   :synopsis: Mixin to provide pickle loading and saving functionalities.
"""

import gzip
import logging
import pickle
from operator import itemgetter
from pathlib import Path
from typing import IO, Dict, List, Optional, Union, cast

logger = logging.getLogger(__name__)


# Protocol only available in py38
# class SymSpellProtocol(Protocol):
#     data_version: int
#     _count_threshold: int
#     _max_dictionary_edit_distance: int
#     _prefix_length: int
#     _deletes: Dict[str, List[str]]
#     _words: Dict[str, int]
#     _max_length: int


[docs]class PickleMixin: """Implements saving and loading pickle functionality for SymSpell.""" data_version: int _below_threshold_words: Dict[str, int] _bigrams: Dict[str, int] _deletes: Dict[str, List[str]] _words: Dict[str, int] _count_threshold: int _max_dictionary_edit_distance: int _max_length: int _prefix_length: int
[docs] def load_pickle( self, data: Union[bytes, Path], compressed: bool = True, from_bytes: bool = False, ) -> bool: """Loads delete combination from file as pickle. This will reduce the loading time compared to running :meth:`load_dictionary` again. Args: data: Either bytes string to be used with ``from_bytes=True`` or the path+filename of the pickle file to be used with ``from_bytes=False``. compressed: A flag to determine whether to read the pickled data as compressed data. from_bytes: Flag to determine if we are loading from bytes or file. Returns: ``True`` if delete combinations are successfully loaded. """ if from_bytes: assert isinstance(data, bytes) return self._load_pickle_stream(data, from_bytes) if compressed: with gzip.open(data, "rb") as gzip_infile: return self._load_pickle_stream(cast(IO[bytes], gzip_infile)) else: with open(data, "rb") as infile: return self._load_pickle_stream(infile)
[docs] def save_pickle( self, filename: Optional[Path] = None, compressed: bool = True, to_bytes: bool = False, ) -> Optional[bytes]: """Pickles :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into a stream for quicker loading later. Args: filename: The path+filename of the pickle file. compressed: A flag to determine whether to compress the pickled data. to_bytes: Flag to determine by bytes string should be returned instead of wrting to file. Returns: A byte string of the pickled data if ``to_bytes=True``. """ if to_bytes: return self._save_pickle_stream(to_bytes=to_bytes) assert filename is not None if compressed: with gzip.open(filename, "wb") as gzip_outfile: self._save_pickle_stream(cast(IO[bytes], gzip_outfile)) else: with open(filename, "wb") as outfile: self._save_pickle_stream(outfile) return None
[docs] def _load_pickle_stream( self, stream: Union[bytes, IO[bytes]], from_bytes: bool = False ) -> bool: """Loads delete combination from stream as pickle. This will reduce the loading time compared to running :meth:`load_dictionary` again. **NOTE**: Prints warning if the current settings `count_threshold`, `max_dictionary_edit_distance`, and `prefix_length` are different from the loaded settings. Overwrite current settings with loaded settings. Args: stream: The stream from which the pickle data is loaded. from_bytes: Flag to determine if we are loading from bytes or file. Returns: ``True`` if delete combinations are successfully loaded. """ if from_bytes: assert isinstance(stream, bytes) pickle_data = pickle.loads(stream) # nosec else: assert not isinstance(stream, bytes) pickle_data = pickle.load(stream) # nosec if pickle_data.get("data_version", None) != self.data_version: return False settings = ("count_threshold", "max_dictionary_edit_distance", "prefix_length") if itemgetter(*settings)(pickle_data) != ( self._count_threshold, self._max_dictionary_edit_distance, self._prefix_length, ): logger.warning( f"Loading data which was created using different {settings} settings. " "Overwriting current SymSpell instance with loaded settings ..." ) self._deletes = pickle_data["deletes"] self._words = pickle_data["words"] self._max_length = pickle_data["max_length"] # Dictionary entries related variables self._below_threshold_words = pickle_data["below_threshold_words"] self._bigrams = pickle_data["bigrams"] self._deletes = pickle_data["deletes"] self._words = pickle_data["words"] self._max_length = pickle_data["max_length"] # SymSpell settings used to generate the above self._count_threshold = pickle_data["count_threshold"] self._max_dictionary_edit_distance = pickle_data["max_dictionary_edit_distance"] self._prefix_length = pickle_data["prefix_length"] return True
[docs] def _save_pickle_stream( self, stream: Optional[IO[bytes]] = None, to_bytes=False ) -> Optional[bytes]: """Pickles :attr:`_below_threshold_words`, :attr:`_bigrams`, :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into a stream for quicker loading later. Pickles :attr:`_count_threshold`, :attr:`_max_dictionary_edit_distance`, and :attr:`_prefix_length` to ensure consistent behavior. Args: stream: The stream to store the pickle data. to_bytes: Flag to determine by bytes string should be returned instead of wrting to file. Returns: A byte string of the pickled data if ``to_bytes=True``. """ pickle_data = { # Dictionary entries related variables "below_threshold_words": self._below_threshold_words, "bigrams": self._bigrams, "deletes": self._deletes, "words": self._words, "max_length": self._max_length, # SymSpell settings used to generate the above "count_threshold": self._count_threshold, "max_dictionary_edit_distance": self._max_dictionary_edit_distance, "prefix_length": self._prefix_length, # Version to ensure compatibility "data_version": self.data_version, } if to_bytes: return pickle.dumps(pickle_data) assert stream is not None pickle.dump(pickle_data, stream) return None