Skip to content

Optimization: borrow texts#5

Open
vody-am wants to merge 1 commit into raphaelsty:main from vody-am:main
Open

Optimization: borrow texts#5
vody-am wants to merge 1 commit into raphaelsty:main from vody-am:main

Conversation

@vody-am
Copy link
Copy Markdown

@vody-am vody-am commented Apr 5, 2026

Hi,

I have made a (very) small change to the signature of the private method _fit for the SparseMatrixBuilder so that it takes its argument by borrow instead of by move. This appears to result in a significant performance improvement to text vectorization (~20%) from my measurements. If you have the time, please double-check, but it seems correct to me. I have included a small plot and script in case you want to try reproducing the timings.

OPTIMIZATIONS

Timings:

Before:
 10000 docs – LeNLP: 0.1056s, sklearn: 0.2025s
 50000 docs – LeNLP: 0.5402s, sklearn: 1.0127s
100000 docs – LeNLP: 1.0883s, sklearn: 2.0256s
250000 docs – LeNLP: 2.7803s, sklearn: 5.0870s
500000 docs – LeNLP: 5.7044s, sklearn: 10.1802s
750000 docs – LeNLP: 9.0268s, sklearn: 15.2542s
1000000 docs – LeNLP: 13.0205s, sklearn: 20.1161s

After:
 10000 docs – LeNLP: 0.0814s, sklearn: 0.2025s
 50000 docs – LeNLP: 0.4312s, sklearn: 1.0097s
100000 docs – LeNLP: 0.8738s, sklearn: 2.0351s
250000 docs – LeNLP: 2.2482s, sklearn: 5.0698s
500000 docs – LeNLP: 4.4222s, sklearn: 10.0993s
750000 docs – LeNLP: 6.7196s, sklearn: 15.6493s
1000000 docs – LeNLP: 9.1809s, sklearn: 20.3156s

script:

#!/usr/bin/env python
"""
Benchmark LeNLP TfidfVectorizer vs scikit-learn TfidfVectorizer.

Generates synthetic documents of varying sizes and measures runtime for
fit_transform operation.
"""

import time
import random
random.seed(0)
import matplotlib.pyplot as plt

from lenlp import sparse as le_sparse
from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer


def generate_documents(n_docs: int, n_words: int = 100, vocab_size: int = 1000) -> list[str]:
    """Build ``n_docs`` synthetic documents.

    Every document is made of ``n_words`` tokens sampled (with replacement)
    from a fixed vocabulary of ``vocab_size`` distinct words.
    """
    vocabulary = [f"word{idx}" for idx in range(vocab_size)]
    documents = []
    for _ in range(n_docs):
        documents.append(" ".join(random.choices(vocabulary, k=n_words)))
    return documents


def benchmark(vectorizer_factory, docs: list[str], n_repeat: int = 3) -> float:
    """Time ``fit_transform`` over ``n_repeat`` fresh vectorizers.

    A new vectorizer is created for each repetition so state from a previous
    fit cannot skew the measurement; returns the mean elapsed seconds.
    """
    total = 0.0
    for _ in range(n_repeat):
        vectorizer = vectorizer_factory()
        t0 = time.perf_counter()
        vectorizer.fit_transform(docs)
        total += time.perf_counter() - t0
    return total / n_repeat


def main():
    """Run the LeNLP vs scikit-learn TF-IDF benchmark and plot the results."""
    # Corpus sizes to sweep over.
    doc_counts = [10_000, 50_000, 100_000, 250_000, 500_000, 750_000, 1_000_000]

    lenlp_runtimes: list[float] = []
    sklearn_runtimes: list[float] = []

    for n in doc_counts:
        corpus = generate_documents(n)
        # Factories are lambdas so each repetition gets a fresh vectorizer.
        le_time = benchmark(
            lambda: le_sparse.TfidfVectorizer(analyzer="word", ngram_range=(1, 1), normalize=True),
            corpus,
        )
        sk_time = benchmark(
            lambda: SklearnTfidfVectorizer(analyzer="word", ngram_range=(1, 1), norm="l2", use_idf=True),
            corpus,
        )
        lenlp_runtimes.append(le_time)
        sklearn_runtimes.append(sk_time)
        print(f"{n:6d} docs – LeNLP: {le_time:.4f}s, sklearn: {sk_time:.4f}s")

    # Render and persist the comparison chart.
    plt.figure(figsize=(8, 5))
    plt.plot(doc_counts, lenlp_runtimes, marker="o", label="LeNLP TfidfVectorizer")
    plt.plot(doc_counts, sklearn_runtimes, marker="o", label="scikit-learn TfidfVectorizer")
    plt.xlabel("Number of documents")
    plt.ylabel("Runtime (seconds)")
    plt.title("LeNLP vs scikit-learn TF-IDF fit_transform benchmark")
    plt.legend()
    plt.grid(True, which="both", ls="--", alpha=0.5)
    plt.tight_layout()
    plt.savefig("docs/tfidf_benchmark.png")
    plt.show()


if __name__ == "__main__":
    main()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant