Source code for word_vectors

"""Read, Write, and Convert between different word vector serialization formats."""

#: The version string of the ``word_vectors`` package.
__version__ = "4.0.2"

from typing import Dict, Tuple
from enum import Enum
import numpy as np


#: A mapping of word to integer index. This index is used to pull the word's
#: vector from the matrix of word vectors.
Vocab = Dict[str, int]
#: The actual word vectors. These are always of rank 2 and have the shape ``[vocab size, vector size]``
Vectors = np.ndarray


class FileType(Enum):
    """The word vector serialization formats that can be read and written.

    ``FASTTEXT`` and ``NUMBERBATCH`` carry the same value as ``W2V_TEXT``, so
    :py:class:`enum.Enum` treats them as aliases of ``W2V_TEXT`` rather than
    as distinct members.
    """

    #: The Glove format. :py:func:`~word_vectors.read.read_glove` describes the
    #: file layout and lists common pre-trained embeddings distributed this way.
    GLOVE = "glove"
    #: The text format introduced by Word2Vec. :py:func:`~word_vectors.read.read_w2v_text`
    #: describes the file layout and lists common pre-trained embeddings distributed this way.
    W2V_TEXT = "w2v-text"
    #: The binary format used by Word2Vec and the pre-trained GoogleNews vectors.
    #: :py:func:`~word_vectors.read.read_w2v` describes the file layout and lists
    #: common pre-trained embeddings distributed this way.
    W2V = "w2v"
    #: Our new Leader file format. :py:func:`~word_vectors.read.read_leader` describes the file layout.
    LEADER = "leader"
    #: The format FastText vectors are distributed in, which is just the word2vec text format.
    #: :py:func:`~word_vectors.read.read_w2v_text` describes the file layout.
    FASTTEXT = "w2v-text"
    #: The format Numberbatch vectors are distributed in, which is just the word2vec text format.
    #: :py:func:`~word_vectors.read.read_w2v_text` describes the file layout.
    NUMBERBATCH = "w2v-text"

    @classmethod
    def from_string(cls, value: str) -> "FileType":
        """Parse a file type name into its Enum member.

        The name is lower-cased before matching, so the lookup is case-insensitive.

        Args:
            value: The name of the file type.

        Returns:
            The Enum member the name refers to.

        Raises:
            ValueError: If the name does not match any known file type.
        """
        value = value.lower()
        # Every accepted spelling, mapped to the member it selects.
        spellings = {
            "glove": cls.GLOVE,
            "w2v_text": cls.W2V_TEXT,
            "w2v-text": cls.W2V_TEXT,
            "w2v": cls.W2V,
            "leader": cls.LEADER,
            "numberbatch": cls.NUMBERBATCH,
            "fasttext": cls.FASTTEXT,
            "fast-text": cls.FASTTEXT,
            "fast_text": cls.FASTTEXT,
        }
        member = spellings.get(value)
        if member is None:
            raise ValueError(f"Unable to understand file type, got: {value}")
        return member

    def __str__(self) -> str:
        """Render the member as its raw value, which is suitable for use in filenames."""
        return self.value
INT_SIZE = 4 #: The size of an int32 in bytes used when reading binary files. FLOAT_SIZE = 4 #: The size of a float32 in bytes when reading a binary file. LONG_SIZE = 8 #: The size of an int64 in bytes when reading binary files. LEADER_HEADER = 3 #: The number of elements in the Leader format header. LEADER_MAGIC_NUMBER = 38941 #: A magic number used to identify a Leader format file. import word_vectors.read as read_module import word_vectors.write as write_module import word_vectors.convert as convert_module from word_vectors.read import ( read, read_with_vocab, read_w2v, read_w2v_with_vocab, read_w2v_text, read_w2v_with_vocab, read_glove, read_glove_with_vocab, read_leader, read_leader_with_vocab, verify_leader, ) from word_vectors.convert import ( convert, w2v_to_leader, w2v_to_glove, w2v_to_w2v_text, glove_to_leader, glove_to_w2v, glove_to_w2v_text, w2v_text_to_leader, w2v_text_to_w2v, w2v_text_to_glove, leader_to_glove, leader_to_w2v, leader_to_w2v_text, ) from word_vectors.write import write, write_w2v, write_w2v_text, write_glove, write_leader