https://github.com/snowballstem/pystemmer
Raw File
Tip revision: cbe740d525d01ae668ad59354aa75eff2b325f4e authored by Stefano Rivera on 26 November 2023, 19:48:16 UTC
Add Python 3.12 to CI
Tip revision: cbe740d
benchmark.py
#!/usr/bin/env python

# This script runs a simple benchmark of the python stemmer interface.

import timeit

# Corpora to benchmark; the paths are resolved relative to the current
# working directory.
datafiles = ('sampledata/englishvoc.txt', 'sampledata/puttydoc.txt',)
# NOTE(review): not referenced anywhere in this script; kept in case something
# else reads it -- confirm and remove.
words_lst = [None]

for datafile in datafiles:
    # Load the corpus in this process only so the report can include its word
    # count.  The timed code loads its own copy inside the timeit setup below.
    words = []
    with open(datafile) as infile:
        for line in infile:
            words.extend(line.split())

    for cache_size in (0, 1, 10000, 30000):
        # Setup code run by timeit outside the timed region: build a stemmer
        # with the requested cache size and load the word list.  The path is
        # interpolated with %r so that it is always a valid string literal,
        # even if it contains quotes or backslashes.
        setup = r"""
import Stemmer
stemmer = Stemmer.Stemmer('en', %d)
words = []
with open(%r) as infile:
    for line in infile:
        words.extend(line.split())
""" % (cache_size, datafile)
        t = timeit.Timer(setup=setup,
                         stmt='stemmer.stemWords(words)')
        for iters in (1, 2, 3, 10):
            # Each of the 5 repeats runs the statement `iters` times against
            # one stemmer instance and returns the total elapsed time, so
            # divide to get the average time per stemWords() call.
            times = [elapsed / iters for elapsed in t.repeat(5, iters)]
            # Report the fastest repeat.
            print("'%s':words=%d,cacheSize=%d,iters=%d,mintime=%f" %
                  (datafile, len(words), cache_size, iters, min(times)))
back to top