Source code for word_vectors.convert

"""Convert between word vector formats.

We provide the main :py:func:`~word_vectors.convert.convert` function for converting
between arbitrary formats based on the passed :py:attr:`~word_vectors.FileType` (or
by sniffing the input file with :py:func:`~word_vectors.read.sniff` when not provided)
as well as several convenience functions for converting between different pairs of formats.
"""

import logging
from typing import Union, TextIO, BinaryIO, Optional
from word_vectors import FileType
from word_vectors.read import read
from word_vectors.write import write
from word_vectors.utils import create_output_path


LOGGER = logging.getLogger("word_vectors")


# We don't know what mode to open the file in (text for things like GloVe,
# binary for things like Word2Vec or Leader), so we can't use the `@file_or_name`
# decorator directly. However, all the functions we call use it, so we can still
# handle all the file formats.
def convert(
    f: Union[str, TextIO, BinaryIO],
    output: Optional[str] = None,
    output_file_type: FileType = FileType.LEADER,
    input_file_type: Optional[FileType] = None,
):
    """Read vectors in one format and write them back out in another.

    Args:
        f: The file to read from.
        output: The name for the output file. When ``None`` the name is
            produced by ``create_output_path`` from ``f`` and
            ``output_file_type``.
        output_file_type: The vector serialization format to use when
            writing out the vectors.
        input_file_type: An explicit vector format to use when reading. It is
            forwarded unchanged to ``read``.
    """
    LOGGER.info("Reading vectors from %s", f)
    vocab, vectors = read(f, input_file_type)

    # Only derive an output location when the caller did not choose one.
    if output is None:
        output = create_output_path(f, output_file_type)

    LOGGER.info("Writing vectors to %s", output)
    write(output, vocab, vectors, output_file_type)
def w2v_to_leader(f: Union[str, BinaryIO], output: Optional[str] = None):
    """Convert binary Word2Vec vectors into the Leader format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.LEADER,
        input_file_type=FileType.W2V,
    )
def glove_to_leader(f: Union[str, TextIO], output: Optional[str] = None):
    """Convert GloVe formatted vectors to the Leader format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    # The enum member is spelled ``LEADER`` (as in ``convert``'s default);
    # member lookup is case-sensitive, so ``FileType.Leader`` would fail.
    convert(
        f,
        output=output,
        output_file_type=FileType.LEADER,
        input_file_type=FileType.GLOVE,
    )
def w2v_text_to_leader(f: Union[str, TextIO], output: Optional[str] = None):
    """Convert text Word2Vec formatted vectors to the Leader format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    # The enum member is spelled ``LEADER`` (as in ``convert``'s default);
    # member lookup is case-sensitive, so ``FileType.Leader`` would fail.
    convert(
        f,
        output=output,
        output_file_type=FileType.LEADER,
        input_file_type=FileType.W2V_TEXT,
    )
def w2v_to_w2v_text(f: Union[str, BinaryIO], output: Optional[str] = None):
    """Convert binary Word2Vec formatted vectors to the text Word2Vec format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.W2V_TEXT,
        input_file_type=FileType.W2V,
    )
def w2v_to_glove(f: Union[str, BinaryIO], output: Optional[str] = None):
    """Convert binary Word2Vec vectors into the GloVe format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.GLOVE,
        input_file_type=FileType.W2V,
    )
def w2v_text_to_glove(f: Union[str, TextIO], output: Optional[str] = None):
    """Convert text Word2Vec vectors into the GloVe format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.GLOVE,
        input_file_type=FileType.W2V_TEXT,
    )
def w2v_text_to_w2v(f: Union[str, TextIO], output: Optional[str] = None):
    """Convert text Word2Vec vectors into the binary Word2Vec format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.W2V,
        input_file_type=FileType.W2V_TEXT,
    )
def glove_to_w2v(f: Union[str, TextIO], output: Optional[str] = None):
    """Convert GloVe vectors into the binary Word2Vec format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.W2V,
        input_file_type=FileType.GLOVE,
    )
def glove_to_w2v_text(f: Union[str, TextIO], output: Optional[str] = None):
    """Convert GloVe vectors into the text Word2Vec format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.W2V_TEXT,
        input_file_type=FileType.GLOVE,
    )
def leader_to_w2v(f: Union[str, BinaryIO], output: Optional[str] = None):
    """Convert Leader vectors into the binary Word2Vec format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.W2V,
        input_file_type=FileType.LEADER,
    )
def leader_to_w2v_text(f: Union[str, BinaryIO], output: Optional[str] = None):
    """Convert Leader vectors into the text Word2Vec format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.W2V_TEXT,
        input_file_type=FileType.LEADER,
    )
def leader_to_glove(f: Union[str, BinaryIO], output: Optional[str] = None):
    """Convert Leader vectors into the GloVe format.

    Thin wrapper around ``convert`` with the input and output formats fixed.

    Args:
        f: The file to read from.
        output: The name for the output file. If not provided we use the
            input file name with a modified extension.
    """
    convert(
        f,
        output=output,
        output_file_type=FileType.GLOVE,
        input_file_type=FileType.LEADER,
    )