Source code for word_vectors.utils

"""Utilities for working with word vector I/O."""

import os
import pathlib
from contextlib import contextmanager
from typing import Tuple, Iterable, Union, BinaryIO, IO, Callable
import numpy as np
from file_or_name import file_or_name
from word_vectors import Vocab, FileType


# Bytes treated as "text" when guessing whether a file is binary: the printable
# ASCII range (codes 32 through 126) plus newline, carriage return, tab, form
# feed and backspace.
ASCII_CHARACTERS = bytes(range(32, 127)) + b"\n\r\t\f\b"



[docs]
def find_space(buf: bytes, offset: int) -> Tuple[str, int]:
    """Find the first space after offset and return the word before it and the new offset.

    The search starts at ``offset + 1``, so the byte at ``offset`` is always part
    of the returned word.

    Args:
        buf: The bytes buffer we are looking for a space in.
        offset: Where in the buffer the word starts.

    Returns:
        A (word, offset) tuple where word is the text (decoded from ``utf-8``) starting at
        the original offset until the first space. Offset is the index of the location just
        after the space we just found.

    Raises:
        ValueError: If there is no space in ``buf`` at or after ``offset + 1``.
        UnicodeDecodeError: If the bytes of the word are not valid ``utf-8``.
    """
    # ``find`` reports a missing space with -1. A manual byte-by-byte scan would
    # keep reading empty slices past the end of the buffer and never terminate.
    space = buf.find(b" ", offset + 1)
    if space == -1:
        raise ValueError(f"No space found in buffer after offset {offset}")
    word = buf[offset:space].decode("utf-8")
    return word, space + 1




[docs]
@file_or_name(f="rb")
def is_binary(
    f: Union[str, BinaryIO], block_size: int = 512, ratio: float = 0.30, text_characters: bytes = ASCII_CHARACTERS
) -> bool:
    """Heuristically decide whether a file contains binary data.

    This is based on the implementation from `here`_

    .. _here: https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python

    Args:
        f: The file we are testing.
        block_size: How many bytes to read from the file for the check.
        ratio: The fraction of non-text bytes in the sample above which the file
            is considered binary.
        text_characters: The bytes that count as text. Every other byte in the sample
            counts as non-text.

    Returns:
        True if the file is binary, False otherwise
    """
    # Sniff inside a bookmark so the read position is put back for whoever
    # reads from this file object next.
    with bookmark(f):
        sample = f.read(block_size)

    # An empty file is defined to be a text file.
    if not sample:
        return False
    # Any null byte means the file must be binary.
    if b"\x00" in sample:
        return True

    # Strip every text byte so that only the non-text bytes are left to count.
    leftover = sample.translate(None, delete=text_characters)
    non_text_fraction = len(leftover) / len(sample)
    return non_text_fraction > ratio




[docs]
@contextmanager
def bookmark(f: IO):
    """Bookmark where we are in a file so we can return.

    This is a context manager that lets us save our spot in an open file,
    do some operations on that file, and then return to the original spot.
    The position is restored even if the body of the ``with`` block raises.

    This is very useful for things like sniffing a file. If the file is
    already open and you read in some bytes to estimate the format you need
    to remember to reset to the start or else you will get wrong results.
    This context manager automates this. ::

        f.tell()
        >>> 120
        with bookmark(f):
            _ = f.read(1024)
            print(f.tell())
        >>> 1144
        f.tell()
        >>> 120

    Args:
        f: The file we are bookmarking.
    """
    start = f.tell()
    try:
        yield
    finally:
        # Seek back in ``finally`` so an exception raised inside the ``with``
        # body cannot leave the file at the wrong position.
        f.seek(start)




[docs]
def to_vocab(words: Iterable[str]) -> Vocab:
    """Build a vocab that maps each word to its position in ``words``.

    If a word appears more than once it is mapped to the index of its last occurrence.

    Args:
        words: The words in the vocab

    Returns:
        The Vocabulary
    """
    vocab = {}
    for index, word in enumerate(words):
        vocab[word] = index
    return vocab




[docs]
def create_output_path(path: Union[str, IO, pathlib.PurePath], file_type: FileType) -> str:
    """Create the output path by swapping the extension for one based on the vector format.

    Args:
        path: The path to the input file. Either a string, a ``pathlib`` path, or an
            object with a ``name`` attribute holding the path.
        file_type: The vector format we are converting to.

    Returns:
        The new output path with an extension determined by the file type.
    """
    is_path_like = isinstance(path, (str, pathlib.PurePath))
    # Anything that is not a string or pathlib path is asked for its ``name``.
    location = str(path) if is_path_like else path.name
    stem = os.path.splitext(location)[0]
    return f"{stem}.{file_type}"




[docs]
def uniform_initializer(unif: float) -> Callable[[int], np.ndarray]:
    """Build a function that creates random vectors of a requested size.

    Args:
        unif: The bounds that the new vector will be initialized within

    Returns:
        A function that takes a vector size and returns a vector of that size
        drawn uniformly at random between ``-unif`` and ``unif``.
    """
    lower, upper = -unif, unif

    def _unif_initializer(vector_size: int) -> np.ndarray:
        shape = (vector_size,)
        return np.random.uniform(low=lower, high=upper, size=shape)

    return _unif_initializer