
Commit

v0.0.4, increased efficiency of spaCy pipeline for POS tagging + added multiprocessing option

Signed-off-by: Tim Schopf <tim.schopf@t-online.de>
TimSchopf committed Feb 3, 2022
1 parent 8a0003d commit 591d71d
Showing 4 changed files with 109 additions and 112 deletions.
2 changes: 1 addition & 1 deletion keyphrase_vectorizers/_version.py
@@ -1 +1 @@
__version__ = '0.0.3'
__version__ = '0.0.4'
33 changes: 21 additions & 12 deletions keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -52,6 +52,13 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy part-of-speech tagging.
If True, spaCy uses all available cores for part-of-speech tagging of the documents.
Depending on the platform, starting many processes with multiprocessing can add significant overhead.
In particular, the default start method 'spawn', used on macOS (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
binary : bool, default=False
If True, all non-zero counts are set to 1.
This is useful for discrete probabilistic models that model binary events rather than integer counts.
@@ -61,12 +68,14 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
"""

def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
stop_words: str = 'english', lowercase: bool = True, binary: bool = False, dtype: np.dtype = np.int64):
stop_words: str = 'english', lowercase: bool = True, multiprocessing: bool = False,
binary: bool = False, dtype: np.dtype = np.int64):

self.spacy_pipeline = spacy_pipeline
self.pos_pattern = pos_pattern
self.stop_words = stop_words
self.lowercase = lowercase
self.multiprocessing = multiprocessing
self.binary = binary
self.dtype = dtype

@@ -85,11 +94,11 @@ def fit(self, raw_documents: List[str]) -> object:
Fitted vectorizer.
"""

self.keyphrases = self._get_pos_keyphrases_of_multiple_docs(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase)
self.keyphrases = self._get_pos_keyphrases(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase, multiprocessing=self.multiprocessing)

# set n-gram range to zero if no keyphrases could be extracted
if self.keyphrases:
@@ -118,11 +127,11 @@ def fit_transform(self, raw_documents: List[str]) -> List[List[int]]:
Document-keyphrase matrix.
"""

self.keyphrases = self._get_pos_keyphrases_of_multiple_docs(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase)
self.keyphrases = self._get_pos_keyphrases(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase, multiprocessing=self.multiprocessing)

# set n-gram range to zero if no keyphrases could be extracted
if self.keyphrases:
@@ -212,7 +221,7 @@ def get_feature_names(self) -> List[str]:
except AttributeError:
raise DeprecationWarning("get_feature_names() is deprecated. Please use 'get_feature_names_out()' instead.")

def get_feature_names_out(self) -> List[str]:
def get_feature_names_out(self) -> np.ndarray:
"""
Get fitted keyphrases for transformation.
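For reference, a minimal usage sketch of the new option (the example documents are hypothetical; assumes the keyphrase_vectorizers package and the en_core_web_sm pipeline are installed):

from keyphrase_vectorizers import KeyphraseCountVectorizer

# Hypothetical example documents.
docs = [
    "Supervised learning is the machine learning task of learning a function "
    "that maps an input to an output based on example input-output pairs.",
    "Keyphrase extraction selects the most relevant phrases from a document.",
]

# multiprocessing=True makes spaCy POS-tag the documents on all available cores.
vectorizer = KeyphraseCountVectorizer(multiprocessing=True)
document_keyphrase_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())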
14 changes: 12 additions & 2 deletions keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -80,6 +80,13 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy part-of-speech tagging.
If True, spaCy uses all available cores for part-of-speech tagging of the documents.
Depending on the platform, starting many processes with multiprocessing can add significant overhead.
In particular, the default start method 'spawn', used on macOS (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
binary : bool, default=False
If True, all non-zero counts are set to 1.
This is useful for discrete probabilistic models that model binary events rather than integer counts.
@@ -107,14 +114,16 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):

def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
stop_words: str = 'english',
lowercase: bool = True, binary: bool = False, dtype: np.dtype = np.float64, norm: str = "l2",
lowercase: bool = True, multiprocessing: bool = False, binary: bool = False,
dtype: np.dtype = np.float64, norm: str = "l2",
use_idf: bool = True, smooth_idf: bool = True,
sublinear_tf: bool = False):

self.spacy_pipeline = spacy_pipeline
self.pos_pattern = pos_pattern
self.stop_words = stop_words
self.lowercase = lowercase
self.multiprocessing = multiprocessing
self.binary = binary
self.dtype = dtype
self.norm = norm
@@ -126,7 +135,8 @@ def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
sublinear_tf=self.sublinear_tf)

super().__init__(spacy_pipeline=self.spacy_pipeline, pos_pattern=self.pos_pattern, stop_words=self.stop_words,
lowercase=self.lowercase, binary=self.binary, dtype=self.dtype)
lowercase=self.lowercase, multiprocessing=self.multiprocessing, binary=self.binary,
dtype=self.dtype)

def _check_params(self):
"""
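The TF-IDF variant simply forwards the new parameter to its parent class. A minimal sketch under the same assumptions (hypothetical documents):

from keyphrase_vectorizers import KeyphraseTfidfVectorizer

docs = [
    "Supervised learning is the machine learning task of learning a function.",
    "Keyphrase extraction selects the most relevant phrases from a document.",
]

# The multiprocessing flag is passed through to KeyphraseCountVectorizer.
vectorizer = KeyphraseTfidfVectorizer(multiprocessing=True, norm='l2', use_idf=True)
tfidf_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())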
172 changes: 75 additions & 97 deletions keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -6,6 +6,7 @@
"""

import logging
import os
from typing import List

import spacy
@@ -64,15 +65,14 @@ def _remove_prefixes(self, text: str, prefixes: List[str]) -> str:
return text[len(prefix):].strip()
return text

def _get_pos_keyphrases(self, document: str, stop_words: str, spacy_pipeline: str, pos_pattern: str,
lowercase: bool = True) -> List[str]:
def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_pipeline: str, pos_pattern: str,
lowercase: bool = True, multiprocessing: bool = False) -> List[str]:
"""
Select keyphrases with part-of-speech tagging from a list of text documents.
Parameters
----------
document : str
Text document from which to extract the keyphrases.
document_list : list of str
List of text documents from which to extract the keyphrases.
stop_words : str
Language of stopwords to remove from the documents, e.g. 'english'.
@@ -88,15 +88,28 @@ def _get_pos_keyphrases(self, document: str, stop_words: str, spacy_pipeline: str,
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy POS tagging.
If True, spaCy uses all available cores to POS-tag the documents.
Depending on the platform, starting many processes with multiprocessing can add significant overhead.
In particular, the default start method 'spawn', used on macOS (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
Returns
-------
keyphrases : List of unique keyphrases of varying length, extracted from the text documents with the defined 'pos_pattern'.
"""

# triggers a parameter validation
if not isinstance(document, str):
if isinstance(document_list, str):
raise ValueError(
"Given document is not a string."
"Iterable over raw text documents expected, string object received."
)

# triggers a parameter validation
if not hasattr(document_list, '__iter__'):
raise ValueError(
"Iterable over raw text documents expected."
)

# triggers a parameter validation
@@ -121,9 +134,11 @@ def _get_pos_keyphrases(self, document: str, stop_words: str, spacy_pipeline: str,
if stop_words:
stop_words_list = set(stopwords.words(stop_words))

# add spaCy POS tags for document
# add spaCy POS tags for documents
try:
nlp = spacy.load(spacy_pipeline)
nlp = spacy.load(spacy_pipeline,
exclude=['ner', 'entity_linker', 'entity_ruler', 'textcat', 'textcat_multilabel',
'lemmatizer', 'morphologizer', 'senter', 'sentencizer', 'transformer'])
except OSError:
# set logger
logger = logging.getLogger('KeyphraseVectorizer')
@@ -136,94 +151,57 @@ def _get_pos_keyphrases(self, document: str, spacy_pipeline: str,
logger.info(
'It looks like the selected spaCy pipeline is not downloaded yet. Attempting to download it now.')
spacy.cli.download(spacy_pipeline)
nlp = spacy.load(spacy_pipeline)
nlp = spacy.load(spacy_pipeline,
exclude=['ner', 'entity_linker', 'entity_ruler', 'textcat', 'textcat_multilabel',
'lemmatizer', 'morphologizer', 'senter', 'sentencizer', 'transformer'])

tagged_doc = nlp(document)
tagged_pos_doc = []
for sentence in tagged_doc.sents:
pos_tagged_sentence = []
for word in sentence:
pos_tagged_sentence.append((word.text, word.tag_))
tagged_pos_doc.append(pos_tagged_sentence)
keyphrases_list = []
if multiprocessing:
num_workers = -1
os.environ["TOKENIZERS_PARALLELISM"] = "false"
else:
num_workers = 1

# extract keyphrases that match the NLTK RegexpParser filter
cp = RegexpParser('CHUNK: {(' + pos_pattern + ')}')
keyphrases = []
prefix_list = [stop_word + ' ' for stop_word in stop_words_list]
suffix_list = [' ' + stop_word for stop_word in stop_words_list]
for sentence in tagged_pos_doc:
tree = cp.parse(sentence)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':
# join candidate keyphrase from single words
keyphrase = ' '.join([i[0] for i in subtree.leaves()])

# convert keyphrase to lowercase
if lowercase:
keyphrase = keyphrase.lower()

# remove stopword suffixes
keyphrase = self._remove_suffixes(keyphrase, suffix_list)

# remove stopword prefixes
keyphrase = self._remove_prefixes(keyphrase, prefix_list)

# remove whitespace from the beginning and end of keyphrases
keyphrase = keyphrase.strip()

# do not include single keywords that are actually stopwords
if keyphrase.lower() not in stop_words_list:
keyphrases.append(keyphrase)

# remove potential empty keyphrases
keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase != '']

return list(set(keyphrases))

def _get_pos_keyphrases_of_multiple_docs(self, document_list: List[str], stop_words: str, spacy_pipeline: str,
pos_pattern: str, lowercase: bool = True) -> List[str]:
"""
Select keyphrases with part-of-speech tagging from a list of text documents.
Parameters
----------
document_list : list of str
List of text documents from which to extract the keyphrases.
stop_words : str
Language of stopwords to remove from the document, e.g.'english.
Supported options are `stopwords available in NLTK`_.
Removes unwanted stopwords from keyphrases if 'stop_words' is not None.
spacy_pipeline : str
The name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text.
pos_pattern : str
The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text.
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
Returns
-------
keyphrases : List of unique keyphrases of varying length, extracted from the given text documents with the given 'pos_pattern'.
"""

# triggers a parameter validation
if isinstance(document_list, str):
raise ValueError(
"Iterable over raw text documents expected, string object received."
)

# triggers a parameter validation
if not hasattr(document_list, '__iter__'):
raise ValueError(
"Iterable over raw text documents expected."
)

keyphrases = [
self._get_pos_keyphrases(document=doc, stop_words=stop_words, spacy_pipeline=spacy_pipeline,
pos_pattern=pos_pattern, lowercase=lowercase) for doc in document_list]
keyphrases = [keyphrase for sub_keyphrase_list in keyphrases for keyphrase in
sub_keyphrase_list]
return list(set(keyphrases))
for tagged_doc in nlp.pipe(document_list, n_process=num_workers):
tagged_pos_doc = []
for sentence in tagged_doc.sents:
pos_tagged_sentence = []
for word in sentence:
pos_tagged_sentence.append((word.text, word.tag_))
tagged_pos_doc.append(pos_tagged_sentence)

# extract keyphrases that match the NLTK RegexpParser filter
keyphrases = []
prefix_list = [stop_word + ' ' for stop_word in stop_words_list]
suffix_list = [' ' + stop_word for stop_word in stop_words_list]
for sentence in tagged_pos_doc:
tree = cp.parse(sentence)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':
# join candidate keyphrase from single words
keyphrase = ' '.join([i[0] for i in subtree.leaves()])

# convert keyphrase to lowercase
if lowercase:
keyphrase = keyphrase.lower()

# remove stopword suffixes
keyphrase = self._remove_suffixes(keyphrase, suffix_list)

# remove stopword prefixes
keyphrase = self._remove_prefixes(keyphrase, prefix_list)

# remove whitespace from the beginning and end of keyphrases
keyphrase = keyphrase.strip()

# do not include single keywords that are actually stopwords
if keyphrase.lower() not in stop_words_list:
keyphrases.append(keyphrase)

# remove potential empty keyphrases
keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase != '']

keyphrases_list.append(list(set(keyphrases)))

return list(set([keyphrase for sub_keyphrase_list in keyphrases_list for keyphrase in sub_keyphrase_list]))
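To illustrate what the reworked mixin now does, here is a condensed, self-contained sketch of the same tagging-and-chunking flow (not the library's exact code; the example document is made up):

import spacy
from nltk import RegexpParser

# Excluding pipeline components that are not needed for POS tagging is the
# source of the efficiency gain in this commit.
nlp = spacy.load('en_core_web_sm',
                 exclude=['ner', 'entity_linker', 'entity_ruler', 'textcat',
                          'textcat_multilabel', 'lemmatizer', 'morphologizer',
                          'senter', 'sentencizer', 'transformer'])

docs = ['Natural language processing enables efficient keyphrase extraction.']

# Chunk grammar built from the default POS pattern '<J.*>*<N.*>+'.
cp = RegexpParser('CHUNK: {(<J.*>*<N.*>+)}')

# n_process=-1 corresponds to multiprocessing=True (all cores); 1 is the default.
for tagged_doc in nlp.pipe(docs, n_process=1):
    for sentence in tagged_doc.sents:
        tree = cp.parse([(word.text, word.tag_) for word in sentence])
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                print(' '.join(leaf[0] for leaf in subtree.leaves()))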
