https://github.com/snowballstem/pystemmer
Tip revision: cbe740d525d01ae668ad59354aa75eff2b325f4e authored by Stefano Rivera on 26 November 2023, 19:48:16 UTC
Add Python 3.12 to CI
Add Python 3.12 to CI
Tip revision: cbe740d
setup.py
#!/usr/bin/env python
from setuptools import setup, Command, Extension
import os
long_description = r"""
Stemming algorithms
PyStemmer provides access to efficient algorithms for calculating a
"stemmed" form of a word. This is a form with most of the common
morphological endings removed; hopefully representing a common
linguistic base form. This is most useful in building search engines
and information retrieval software; for example, a search with stemming
enabled should be able to find a document containing "cycling" given the
query "cycles".
PyStemmer provides algorithms for several (mainly European) languages,
by wrapping the libstemmer library from the Snowball project in a Python
module.
It also provides access to the classic Porter stemming algorithm for
English: although this has been superseded by an improved algorithm, the
original algorithm may be of interest to information retrieval
researchers wishing to reproduce results of earlier experiments.
""".strip()
version_str = '2.2.0.1'
# libstemmer_c versions have 3 components but pystemmer versions may have more
# (so we can address a pystemmer-specific issue without having to wait for the
# next libstemmer_c release) so take the first 3 components from version_str.
libstemmer_c_version = '.'.join(version_str.split('.')[0:3])
class LibrarySourceCode:
# Directories in libstemmer which contain libstemmer sources (ie, not
# examples, etc).
LIBRARY_CORE_DIRS = ('src_c', 'runtime', 'libstemmer', 'include')
DEFAULT_URI = 'https://snowballstem.org/dist/libstemmer_c-%s.tar.gz' % \
libstemmer_c_version
DEFAULT_CHECKSUM = \
'b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3'
def __init__(self, directory='libstemmer_c-%s' % libstemmer_c_version):
""" Constructor.
:param str directory: Path to directory where source code should
reside.
:return void:
"""
self._directory = directory
@property
def manifest_file_path(self):
""" Produce a path to the manifest within the source code.
:return str:
"""
return os.path.join(self._directory, 'mkinc_utf8.mak')
@property
def include_directories(self):
""" Return all paths to include during the compilation of extension.
:return list(str):
"""
return [os.path.join(self._directory, 'include')]
def iter_manifest_lines(self):
""" Open the manifest file from disk and yield its contents, line by
line.
:yield str:
"""
with open(self.manifest_file_path) as file:
for line in file:
yield line
def source_code_paths(self):
""" Find paths to source code files.
:return set(str):
"""
paths = set()
for line in self.iter_manifest_lines():
line = line.strip().replace(' \\', '')
directory = os.path.split(line)[0]
if line.endswith('.c') and directory in self.LIBRARY_CORE_DIRS:
paths.add(os.path.join(self._directory, line))
return paths
def is_present_on_disk(self):
""" Is the source code present on disk?
:return bool:
"""
return os.path.exists(self._directory)
def download(self, url=None, checksum=None):
""" Download and extract the source code from the web.
:param str url: Url to the zipped source code.
:param str checksum: Sha256 hash of the archive.
:return void:
"""
from tarballfetcher import download_and_extract_tarball
download_and_extract_tarball(
url or self.DEFAULT_URI,
expected_sha256=checksum or self.DEFAULT_CHECKSUM)
LIBRARY_SOURCE_CODE = LibrarySourceCode()
SYSTEM_LIBSTEMMER = os.environ.get('PYSTEMMER_SYSTEM_LIBSTEMMER', False)
if SYSTEM_LIBSTEMMER:
C_EXTENSION = Extension(
'Stemmer',
['src/Stemmer.pyx'],
libraries=['stemmer'],
)
else:
if not LIBRARY_SOURCE_CODE.is_present_on_disk():
LIBRARY_SOURCE_CODE.download()
C_EXTENSION = Extension(
'Stemmer',
['src/Stemmer.pyx'] + list(LIBRARY_SOURCE_CODE.source_code_paths()),
include_dirs=LIBRARY_SOURCE_CODE.include_directories
)
class BootstrapCommand(Command):
description = 'Download libstemmer_c dependency'
user_options = [
('libstemmer-url=', None, 'path to libstemmer c library'),
('libstemmer-sha256=', None, 'Expected SHA256 for the tarball'),
]
def initialize_options(self):
self.libstemmer_url = LIBRARY_SOURCE_CODE.DEFAULT_URI
self.libstemmer_sha256 = LIBRARY_SOURCE_CODE.DEFAULT_CHECKSUM
def finalize_options(self):
pass
def run(self):
LIBRARY_SOURCE_CODE.download(
self.libstemmer_url, self.libstemmer_sha256)
setup(name='PyStemmer',
version=version_str,
author='Richard Boulton',
author_email='richard@tartarus.org',
maintainer='Richard Boulton',
maintainer_email='richard@tartarus.org',
url='https://github.com/snowballstem/pystemmer/',
description='Snowball stemming algorithms, for information retrieval',
long_description=long_description,
platforms=["any"],
license="MIT, BSD",
keywords=[
"python",
"information retrieval",
"language processing",
"morphological analysis",
"stemming algorithms",
"stemmers"
],
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"License :: OSI Approved :: BSD License",
"Natural Language :: Danish",
"Natural Language :: Dutch",
"Natural Language :: English",
"Natural Language :: Finnish",
"Natural Language :: French",
"Natural Language :: German",
"Natural Language :: Italian",
"Natural Language :: Norwegian",
"Natural Language :: Portuguese",
"Natural Language :: Russian",
"Natural Language :: Spanish",
"Natural Language :: Swedish",
"Operating System :: OS Independent",
"Programming Language :: C",
"Programming Language :: Other",
"Programming Language :: Python",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.6",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Database",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Text Processing :: Indexing",
"Topic :: Text Processing :: Linguistic",
],
setup_requires=['Cython>=0.28.5', 'setuptools>=18.0'],
ext_modules=[
C_EXTENSION
],
cmdclass={'bootstrap': BootstrapCommand}
)