# Source code for word_vectors

"""Read, Write, and Convert between different word vector serialization formats."""

# The version string of this package.
__version__ = "4.0.0"

from typing import Dict, Tuple
from enum import Enum
import numpy as np


#: A mapping of word to integer index. This index is used to pull the word's
#: vector from the matrix of word vectors.
Vocab = Dict[str, int]
#: The actual word vectors. These are always of rank 2 and have the shape ``[vocab size, vector size]``.
Vectors = np.ndarray


class FileType(Enum):
    """An Enumeration of the Word Vector file types supported.

    ``FASTTEXT`` and ``NUMBERBATCH`` are declared with the same value as
    ``W2V_TEXT``, so :py:class:`enum.Enum` treats them as aliases of
    ``W2V_TEXT`` rather than as distinct members.
    """

    #: The format used by Glove. See :py:func:`~word_vectors.read.read_glove` for a
    #: description of file format and common pre-trained embeddings that use this format.
    GLOVE = "glove"
    #: The text format introduced by Word2Vec. See :py:func:`~word_vectors.read.read_w2v_text`
    #: for a description of the file format and common pre-trained embeddings that use this format.
    W2V_TEXT = "w2v-text"
    #: The binary format used by Word2Vec and pre-trained GoogleNews vectors. See
    #: :py:func:`~word_vectors.read.read_w2v` for a description of the file format and common
    #: pre-trained embeddings that use this format.
    W2V = "w2v"
    #: Our new Leader file format. See :py:func:`~word_vectors.read.read_leader` for a description of the file format.
    LEADER = "leader"
    #: The file format used to distribute FastText vectors, it is just the word2vec text format.
    #: See :py:func:`~word_vectors.read.read_w2v_text` for a description of the file format.
    FASTTEXT = "w2v-text"
    #: The file format used to distribute Numberbatch vectors, it is just the word2vec text format.
    #: See :py:func:`~word_vectors.read.read_w2v_text` for a description of the file format.
    NUMBERBATCH = "w2v-text"

    @classmethod
    def from_string(cls, value: str) -> "FileType":
        """Convert a string into the Enum value.

        Matching is case-insensitive. The accepted spellings are ``glove``,
        ``w2v``, ``w2v-text``/``w2v_text``, ``leader``, ``numberbatch`` and
        ``fasttext``/``fast-text``/``fast_text``.

        Args:
            value: The string specifying the file type.

        Returns:
            The Enum value parsed from the string.

        Raises:
            ValueError: If the string wasn't able to be parsed into
                an Enum value. The message contains the lower-cased input.
        """
        value = value.lower()
        # The table is built inside the method on purpose: a plain attribute
        # assigned in an Enum class body would be turned into a member.
        spellings = {
            "glove": cls.GLOVE,
            "w2v_text": cls.W2V_TEXT,
            "w2v-text": cls.W2V_TEXT,
            "w2v": cls.W2V,
            "leader": cls.LEADER,
            "numberbatch": cls.NUMBERBATCH,
            "fasttext": cls.FASTTEXT,
            "fast-text": cls.FASTTEXT,
            "fast_text": cls.FASTTEXT,
        }
        file_type = spellings.get(value)
        if file_type is None:
            raise ValueError(f"Unable to understand file type, got: {value}")
        return file_type

    def __str__(self) -> str:
        """When calling ``str`` on an enum member output a value suitable for filenames"""
        return self.value


# Byte sizes and header constants for the binary file formats.
INT_SIZE = 4  #: The size of an int32 in bytes, used when reading binary files.
FLOAT_SIZE = 4  #: The size of a float32 in bytes, used when reading binary files.
LONG_SIZE = 8  #: The size of an int64 in bytes, used when reading binary files.
LEADER_HEADER = 3  #: The number of elements in the Leader format header.
LEADER_MAGIC_NUMBER = 38941  #: A magic number used to identify a Leader format file.


import word_vectors.read as read_module
import word_vectors.write as write_module
import word_vectors.convert as convert_module
from word_vectors.read import (
    read,
    read_with_vocab,
    read_w2v,
    read_w2v_with_vocab,
    read_w2v_text,
    read_w2v_with_vocab,
    read_glove,
    read_glove_with_vocab,
    read_leader,
    read_leader_with_vocab,
    verify_leader,
)
from word_vectors.convert import (
    convert,
    w2v_to_leader,
    w2v_to_glove,
    w2v_to_w2v_text,
    glove_to_leader,
    glove_to_w2v,
    glove_to_w2v_text,
    w2v_text_to_leader,
    w2v_text_to_w2v,
    w2v_text_to_glove,
    leader_to_glove,
    leader_to_w2v,
    leader_to_w2v_text,
)
from word_vectors.write import write, write_w2v, write_w2v_text, write_glove, write_leader