Skip to content

API Reference

count_tokens

count_tokens_in_file(file_path, encoding_name='cl100k_base', approximate=None, tokens_per_word=TOKENS_PER_WORD, characters_per_token=CHARACTERS_PER_TOKEN)

Return the number of tokens in a text file.

Parameters:

Name Type Description Default
file_path str

The path to the text file to count the tokens in.

required
encoding_name str

The name of the encoding to use. Default: cl100k_base

'cl100k_base'
approximate str | None

Approximate the number of tokens without tokenizing. Based on: w - words, c - characters

None
tokens_per_word float

The number of tokens per word for word-based approximation. Default: 4/3

TOKENS_PER_WORD
characters_per_token float

The number of characters per token for character-based approximation. Default: 4

CHARACTERS_PER_TOKEN

Returns:

Type Description
int

The number of tokens in the text file.

Source code in src/count_tokens/count.py
def count_tokens_in_file(
    file_path: str,
    encoding_name: str = "cl100k_base",
    approximate: str | None = None,
    tokens_per_word: float = TOKENS_PER_WORD,
    characters_per_token: float = CHARACTERS_PER_TOKEN,
) -> int:
    """Return the number of tokens in a text file.

    The file is decoded as UTF-8 so that the result does not depend on the
    locale encoding of the machine the code runs on.

    Args:
        file_path: The path to the text file to count the tokens in.
        encoding_name: The name of the encoding to use. Default: cl100k_base
        approximate: Approximate the number of tokens without tokenizing.
            Based on: w - words, c - characters. Any other value, including
            None, counts the tokens exactly with ``count_tokens_in_string``.
        tokens_per_word: The number of tokens per word for word-based approximation. Default: 4/3
        characters_per_token: The number of characters per token for character-based approximation. Default: 4

    Returns:
        The number of tokens in the text file. Approximated counts are
        truncated to an integer.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Explicit encoding: the default of read_text() follows the platform
    # locale, which mis-decodes UTF-8 files on non-UTF-8 systems.
    text = pathlib.Path(file_path).read_text(encoding="utf-8")
    if approximate == "w":
        # Words are the whitespace-separated runs returned by str.split().
        return int(len(text.split()) * tokens_per_word)
    if approximate == "c":
        return int(len(text) / characters_per_token)
    # None and unrecognised modes both fall back to exact tokenization.
    return count_tokens_in_string(text, encoding_name)

count_tokens_in_string(string, encoding_name='cl100k_base')

Return the number of tokens in a text string.

Parameters:

Name Type Description Default
string str

The text string to count the tokens in.

required
encoding_name str

The name of the encoding to use. Default: cl100k_base

'cl100k_base'

Returns:

Type Description
int

The number of tokens in the text string.

Source code in src/count_tokens/count.py
def count_tokens_in_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in a text string.

    Text that happens to contain a special-token literal (for example
    ``<|endoftext|>``) is tokenized as ordinary text rather than rejected.

    Args:
        string: The text string to count the tokens in.
        encoding_name: The name of the encoding to use. Default: cl100k_base

    Returns:
        The number of tokens in the text string.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # encode() raises ValueError by default when the input contains a
    # special-token literal; for counting arbitrary text, disable that check
    # so those literals are encoded like any other characters.
    return len(encoding.encode(string, disallowed_special=()))