diff --git a/README.md b/README.md
index f9fd038..063750b 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ vectorizer = KeyphraseCountVectorizer()
 
 # Print parameters
 print(vectorizer.get_params())
->>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'multiprocessing': False, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english'}
+>>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
 ```
 
 By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is
@@ -237,7 +237,7 @@ vectorizer = KeyphraseTfidfVectorizer()
 
 # Print parameters
 print(vectorizer.get_params())
-{'binary': False, 'dtype': <class 'numpy.float64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'multiprocessing': False, 'norm': 'l2', 'pos_pattern': '<J.*>*<N.*>+', 'smooth_idf': True, 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'sublinear_tf': False, 'use_idf': True}
+{'binary': False, 'dtype': <class 'numpy.float64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'norm': 'l2', 'pos_pattern': '<J.*>*<N.*>+', 'smooth_idf': True, 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'sublinear_tf': False, 'use_idf': True, 'workers': 1}
 ```
 
 To calculate tf values instead, set `use_idf=False`.
diff --git a/docs/requirements.txt b/docs/requirements.txt
index f0e1009..94a80da 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -15,4 +15,5 @@ numpy>=1.18.5
 spacy>=3.0.1
 nltk>=3.6.1
 scikit-learn>=1.0
-scipy>=1.7.3
\ No newline at end of file
+scipy>=1.7.3
+psutil>=5.8.0
\ No newline at end of file
diff --git a/keyphrase_vectorizers/_version.py b/keyphrase_vectorizers/_version.py
index eead319..fa9c4ec 100644
--- a/keyphrase_vectorizers/_version.py
+++ b/keyphrase_vectorizers/_version.py
@@ -1 +1 @@
-__version__ = '0.0.5'
+__version__ = '0.0.6'
diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
index 7d4834f..2089ec9 100644
--- a/keyphrase_vectorizers/keyphrase_count_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -10,6 +10,7 @@ from typing import List
 
 import numpy as np
+import psutil
 from sklearn.base import BaseEstimator
 from sklearn.exceptions import NotFittedError
 from sklearn.feature_extraction.text import CountVectorizer
@@ -52,9 +53,10 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     lowercase : bool, default=True
         Whether the returned keyphrases should be converted to lowercase.
 
-    multiprocessing : bool, default=False
-        Whether to use multiprocessing for spaCy part-of-speech tagging.
-        If True, spaCy uses all cores to tag documents with part-of-speech.
+    workers : int, default=1
+        How many workers to use for spaCy part-of-speech tagging.
+        If set to -1, use all available worker threads of the machine.
+        spaCy uses the specified number of cores for part-of-speech tagging.
         Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
         In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow.
         Therefore, carefully consider whether this option is really necessary.
@@ -75,7 +77,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
     """
 
     def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
-                 stop_words: str = 'english', lowercase: bool = True, multiprocessing: bool = False, max_df: int = None,
+                 stop_words: str = 'english', lowercase: bool = True, workers: int = 1, max_df: int = None,
                  min_df: int = None, binary: bool = False, dtype: np.dtype = np.int64):
 
@@ -108,11 +110,22 @@ def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<
                 "'max_df' must be > 'min_df'"
             )
 
+        # triggers a parameter validation
+        if not isinstance(workers, int):
+            raise ValueError(
+                "'workers' parameter must be of type int"
+            )
+
+        if (workers < -1) or (workers > psutil.cpu_count(logical=True)):
+            raise ValueError(
+                "'workers' parameter value must be between -1 and " + str(psutil.cpu_count(logical=True))
+            )
+
         self.spacy_pipeline = spacy_pipeline
         self.pos_pattern = pos_pattern
         self.stop_words = stop_words
         self.lowercase = lowercase
-        self.multiprocessing = multiprocessing
+        self.workers = workers
         self.max_df = max_df
         self.min_df = min_df
         self.binary = binary
@@ -137,7 +150,7 @@ def fit(self, raw_documents: List[str]) -> object:
                                                    stop_words=self.stop_words,
                                                    spacy_pipeline=self.spacy_pipeline,
                                                    pos_pattern=self.pos_pattern,
-                                                   lowercase=self.lowercase, multiprocessing=self.multiprocessing)
+                                                   lowercase=self.lowercase, workers=self.workers)
 
         # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
         # additionally this prevents memory issues during transformation to a document-keyphrase matrix
diff --git a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
index 2d094a2..41621b1 100644
--- a/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
+++ b/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -10,6 +10,7 @@ from typing import List
 
 import numpy as np
+import psutil
 from sklearn.exceptions import NotFittedError
 from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.utils.validation import FLOAT_DTYPES
@@ -80,9 +81,10 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
     lowercase : bool, default=True
         Whether the returned keyphrases should be converted to lowercase.
 
-    multiprocessing : bool, default=False
-        Whether to use multiprocessing for spaCy part-of-speech tagging.
-        If True, spaCy uses all cores to tag documents with part-of-speech.
+    workers : int, default=1
+        How many workers to use for spaCy part-of-speech tagging.
+        If set to -1, use all available worker threads of the machine.
+        spaCy uses the specified number of cores for part-of-speech tagging.
         Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
         In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow.
         Therefore, carefully consider whether this option is really necessary.
@@ -121,17 +123,28 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
 
     def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
                  stop_words: str = 'english',
-                 lowercase: bool = True, multiprocessing: bool = False, max_df: int = None, min_df: int = None,
+                 lowercase: bool = True, workers: int = 1, max_df: int = None, min_df: int = None,
                  binary: bool = False, dtype: np.dtype = np.float64, norm: str = "l2",
                  use_idf: bool = True, smooth_idf: bool = True, sublinear_tf: bool = False):
 
+        # triggers a parameter validation
+        if not isinstance(workers, int):
+            raise ValueError(
+                "'workers' parameter must be of type int"
+            )
+
+        if (workers < -1) or (workers > psutil.cpu_count(logical=True)):
+            raise ValueError(
+                "'workers' parameter value must be between -1 and " + str(psutil.cpu_count(logical=True))
+            )
+
         self.spacy_pipeline = spacy_pipeline
         self.pos_pattern = pos_pattern
         self.stop_words = stop_words
         self.lowercase = lowercase
-        self.multiprocessing = multiprocessing
+        self.workers = workers
         self.max_df = max_df
         self.min_df = min_df
         self.binary = binary
@@ -145,7 +158,7 @@ def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<
                                        sublinear_tf=self.sublinear_tf)
 
         super().__init__(spacy_pipeline=self.spacy_pipeline, pos_pattern=self.pos_pattern, stop_words=self.stop_words,
-                         lowercase=self.lowercase, multiprocessing=self.multiprocessing, max_df=self.max_df,
+                         lowercase=self.lowercase, workers=self.workers, max_df=self.max_df,
                          min_df=self.min_df, binary=self.binary, dtype=self.dtype)
diff --git a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
index 2bdef9e..d111070 100644
--- a/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
+++ b/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -10,6 +10,7 @@ from typing import List
 
 import numpy as np
+import psutil
 import scipy.sparse as sp
 import spacy
 from nltk import RegexpParser
@@ -180,7 +181,7 @@ def _split_long_document(self, text: str, max_text_length: int) -> List[str]:
         return splitted_document
 
     def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_pipeline: str, pos_pattern: str,
-                            lowercase: bool = True, multiprocessing: bool = False) -> List[str]:
+                            lowercase: bool = True, workers: int = 1) -> List[str]:
         """
         Select keyphrases with part-of-speech tagging from a text document.
 
         Parameters
@@ -202,9 +203,10 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
         lowercase : bool, default=True
             Whether the returned keyphrases should be converted to lowercase.
 
-        multiprocessing : bool, default=False
-            Whether to use multiprocessing for spaCy POS tagging.
-            If True, spaCy uses all cores to POS tag documents.
+        workers : int, default=1
+            How many workers to use for spaCy part-of-speech tagging.
+            If set to -1, use all available worker threads of the machine.
+            spaCy uses the specified number of cores for part-of-speech tagging.
             Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
             In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow.
             Therefore, carefully consider whether this option is really necessary.
@@ -244,6 +246,17 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
                 "'pos_pattern' parameter needs to be a regex string. E.g. '<J.*>*<N.*>+'"
             )
 
+        # triggers a parameter validation
+        if not isinstance(workers, int):
+            raise ValueError(
+                "'workers' parameter must be of type int"
+            )
+
+        if (workers < -1) or (workers > psutil.cpu_count(logical=True)):
+            raise ValueError(
+                "'workers' parameter value must be between -1 and " + str(psutil.cpu_count(logical=True))
+            )
+
         stop_words_list = []
         if stop_words:
             stop_words_list = set(stopwords.words(stop_words))
@@ -274,11 +287,8 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
             nlp.add_pipe('sentencizer')
 
         keyphrases_list = []
-        if multiprocessing:
-            num_workers = -1
+        if workers != 1:
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
-        else:
-            num_workers = 1
 
         # split large documents in smaller chunks, so that spacy can process them without memory issues
         docs_list = []
@@ -297,7 +307,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_p
         nlp.max_length = max([len(doc) for doc in document_list]) + 100
 
         cp = RegexpParser('CHUNK: {(' + pos_pattern + ')}')
-        for tagged_doc in nlp.pipe(document_list, n_process=num_workers):
+        for tagged_doc in nlp.pipe(document_list, n_process=workers):
             tagged_pos_doc = []
             for sentence in tagged_doc.sents:
                 pos_tagged_sentence = []
diff --git a/requirements.txt b/requirements.txt
index 3b91726..14aaf6c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ numpy>=1.18.5
 spacy>=3.0.1
 nltk>=3.6.1
 scikit-learn>=1.0
-scipy>=1.7.3
\ No newline at end of file
+scipy>=1.7.3
+psutil>=5.8.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index fb2edaf..1eba0d3 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,8 @@
         'spacy >= 3.0.1',
         'nltk >= 3.6.1',
         'scikit-learn >= 1.0',
-        'scipy>=1.7.3'
+        'scipy>=1.7.3',
+        'psutil>=5.8.0'
     ],
     package_dir={"": "."},
     packages=setuptools.find_packages(where="."),