
Commit

v0.0.4, increased efficiency of spaCy pipeline for POS tagging + added multiprocessing option

Signed-off-by: Tim Schopf <tim.schopf@t-online.de>
TimSchopf committed Feb 3, 2022
1 parent 8a0003d commit 591d71d
Showing 4 changed files with 109 additions and 112 deletions.
2 changes: 1 addition & 1 deletion keyphrase_vectorizers/_version.py
@@ -1 +1 @@
__version__ = '0.0.3'
__version__ = '0.0.4'
33 changes: 21 additions & 12 deletions keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -52,6 +52,13 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy part-of-speech tagging.
If True, spaCy uses all available cores for part-of-speech tagging of the documents.
Depending on the platform, starting many processes with multiprocessing can add significant overhead.
In particular, the default start method 'spawn', used on macOS (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
binary : bool, default=False
If True, all non-zero counts are set to 1.
This is useful for discrete probabilistic models that model binary events rather than integer counts.
@@ -61,12 +68,14 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
"""

def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
stop_words: str = 'english', lowercase: bool = True, binary: bool = False, dtype: np.dtype = np.int64):
stop_words: str = 'english', lowercase: bool = True, multiprocessing: bool = False,
binary: bool = False, dtype: np.dtype = np.int64):

self.spacy_pipeline = spacy_pipeline
self.pos_pattern = pos_pattern
self.stop_words = stop_words
self.lowercase = lowercase
self.multiprocessing = multiprocessing
self.binary = binary
self.dtype = dtype

@@ -85,11 +94,11 @@ def fit(self, raw_documents: List[str]) -> object:
Fitted vectorizer.
"""

self.keyphrases = self._get_pos_keyphrases_of_multiple_docs(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase)
self.keyphrases = self._get_pos_keyphrases(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase, multiprocessing=self.multiprocessing)

# set n-gram range to zero if no keyphrases could be extracted
if self.keyphrases:
@@ -118,11 +127,11 @@ def fit_transform(self, raw_documents: List[str]) -> List[List[int]]:
Document-keyphrase matrix.
"""

self.keyphrases = self._get_pos_keyphrases_of_multiple_docs(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase)
self.keyphrases = self._get_pos_keyphrases(document_list=raw_documents,
stop_words=self.stop_words,
spacy_pipeline=self.spacy_pipeline,
pos_pattern=self.pos_pattern,
lowercase=self.lowercase, multiprocessing=self.multiprocessing)

# set n-gram range to zero if no keyphrases could be extracted
if self.keyphrases:
@@ -212,7 +221,7 @@ def get_feature_names(self) -> List[str]:
except AttributeError:
raise DeprecationWarning("get_feature_names() is deprecated. Please use 'get_feature_names_out()' instead.")

def get_feature_names_out(self) -> List[str]:
def get_feature_names_out(self) -> np.ndarray:
"""
Get fitted keyphrases for transformation.
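For reference, a minimal usage sketch of the new option (the example documents are hypothetical; assumes the keyphrase_vectorizers package and the en_core_web_sm pipeline are installed):

from keyphrase_vectorizers import KeyphraseCountVectorizer

# Hypothetical example documents.
docs = [
    "Supervised learning is the machine learning task of learning a function "
    "that maps an input to an output based on example input-output pairs.",
    "Keyphrase extraction selects the most relevant phrases from a document.",
]

# multiprocessing=True makes spaCy POS-tag the documents on all available cores.
vectorizer = KeyphraseCountVectorizer(multiprocessing=True)
document_keyphrase_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())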
14 changes: 12 additions & 2 deletions keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -80,6 +80,13 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy part-of-speech tagging.
If True, spaCy uses all available cores for part-of-speech tagging of the documents.
Depending on the platform, starting many processes with multiprocessing can add significant overhead.
In particular, the default start method 'spawn', used on macOS (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
binary : bool, default=False
If True, all non-zero counts are set to 1.
This is useful for discrete probabilistic models that model binary events rather than integer counts.
@@ -107,14 +114,16 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):

def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
stop_words: str = 'english',
lowercase: bool = True, binary: bool = False, dtype: np.dtype = np.float64, norm: str = "l2",
lowercase: bool = True, multiprocessing: bool = False, binary: bool = False,
dtype: np.dtype = np.float64, norm: str = "l2",
use_idf: bool = True, smooth_idf: bool = True,
sublinear_tf: bool = False):

self.spacy_pipeline = spacy_pipeline
self.pos_pattern = pos_pattern
self.stop_words = stop_words
self.lowercase = lowercase
self.multiprocessing = multiprocessing
self.binary = binary
self.dtype = dtype
self.norm = norm
@@ -126,7 +135,8 @@ def __init__(self, spacy_pipeline: str = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
sublinear_tf=self.sublinear_tf)

super().__init__(spacy_pipeline=self.spacy_pipeline, pos_pattern=self.pos_pattern, stop_words=self.stop_words,
lowercase=self.lowercase, binary=self.binary, dtype=self.dtype)
lowercase=self.lowercase, multiprocessing=self.multiprocessing, binary=self.binary,
dtype=self.dtype)

def _check_params(self):
"""
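The TF-IDF variant simply forwards the new parameter to its parent class. A minimal sketch under the same assumptions (hypothetical documents):

from keyphrase_vectorizers import KeyphraseTfidfVectorizer

docs = [
    "Supervised learning is the machine learning task of learning a function.",
    "Keyphrase extraction selects the most relevant phrases from a document.",
]

# The multiprocessing flag is passed through to KeyphraseCountVectorizer.
vectorizer = KeyphraseTfidfVectorizer(multiprocessing=True, norm='l2', use_idf=True)
tfidf_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())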
172 changes: 75 additions & 97 deletions keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -6,6 +6,7 @@
"""

import logging
import os
from typing import List

import spacy
@@ -64,15 +65,14 @@ def _remove_prefixes(self, text: str, prefixes: List[str]) -> str:
return text[len(prefix):].strip()
return text

def _get_pos_keyphrases(self, document: str, stop_words: str, spacy_pipeline: str, pos_pattern: str,
lowercase: bool = True) -> List[str]:
def _get_pos_keyphrases(self, document_list: List[str], stop_words: str, spacy_pipeline: str, pos_pattern: str,
lowercase: bool = True, multiprocessing: bool = False) -> List[str]:
"""
Select keyphrases with part-of-speech tagging from a list of text documents.
Parameters
----------
document : str
Text document from which to extract the keyphrases.
document_list : list of str
List of text documents from which to extract the keyphrases.
stop_words : str
Language of stopwords to remove from the documents, e.g. 'english'.
@@ -88,15 +88,28 @@ def _get_pos_keyphrases(self, document: str, stop_words: str, spacy_pipeline: str,
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
multiprocessing : bool, default=False
Whether to use multiprocessing for spaCy POS tagging.
If True, spaCy uses all available cores to POS-tag the documents.
Depending on the platform, starting many processes with multiprocessing can add significant overhead.
In particular, the default start method 'spawn', used on macOS (as of Python 3.8) and on Windows, can be slow.
Therefore, carefully consider whether this option is really necessary.
Returns
-------
keyphrases : List of unique keyphrases of varying length, extracted from the text documents with the defined 'pos_pattern'.
"""

# triggers a parameter validation
if not isinstance(document, str):
if isinstance(document_list, str):
raise ValueError(
"Given document is not a string."
"Iterable over raw text documents expected, string object received."
)

# triggers a parameter validation
if not hasattr(document_list, '__iter__'):
raise ValueError(
"Iterable over raw text documents expected."
)

# triggers a parameter validation
@@ -121,9 +134,11 @@ def _get_pos_keyphrases(self, document: str, stop_words: str, spacy_pipeline: str,
if stop_words:
stop_words_list = set(stopwords.words(stop_words))

# add spaCy POS tags for document
# add spaCy POS tags for documents
try:
nlp = spacy.load(spacy_pipeline)
nlp = spacy.load(spacy_pipeline,
exclude=['ner', 'entity_linker', 'entity_ruler', 'textcat', 'textcat_multilabel',
'lemmatizer', 'morphologizer', 'senter', 'sentencizer', 'transformer'])
except OSError:
# set logger
logger = logging.getLogger('KeyphraseVectorizer')
@@ -136,94 +151,57 @@ def _get_pos_keyphrases(self, document: str, spacy_pipeline: str,
logger.info(
'It looks like the selected spaCy pipeline is not downloaded yet. Attempting to download it now.')
spacy.cli.download(spacy_pipeline)
nlp = spacy.load(spacy_pipeline)
nlp = spacy.load(spacy_pipeline,
exclude=['ner', 'entity_linker', 'entity_ruler', 'textcat', 'textcat_multilabel',
'lemmatizer', 'morphologizer', 'senter', 'sentencizer', 'transformer'])

tagged_doc = nlp(document)
tagged_pos_doc = []
for sentence in tagged_doc.sents:
pos_tagged_sentence = []
for word in sentence:
pos_tagged_sentence.append((word.text, word.tag_))
tagged_pos_doc.append(pos_tagged_sentence)
keyphrases_list = []
if multiprocessing:
num_workers = -1
os.environ["TOKENIZERS_PARALLELISM"] = "false"
else:
num_workers = 1

# extract keyphrases that match the NLTK RegexpParser filter
cp = RegexpParser('CHUNK: {(' + pos_pattern + ')}')
keyphrases = []
prefix_list = [stop_word + ' ' for stop_word in stop_words_list]
suffix_list = [' ' + stop_word for stop_word in stop_words_list]
for sentence in tagged_pos_doc:
tree = cp.parse(sentence)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':
# join candidate keyphrase from single words
keyphrase = ' '.join([i[0] for i in subtree.leaves()])

# convert keyphrase to lowercase
if lowercase:
keyphrase = keyphrase.lower()

# remove stopword suffixes
keyphrase = self._remove_suffixes(keyphrase, suffix_list)

# remove stopword prefixes
keyphrase = self._remove_prefixes(keyphrase, prefix_list)

# remove whitespace from the beginning and end of keyphrases
keyphrase = keyphrase.strip()

# do not include single keywords that are actually stopwords
if keyphrase.lower() not in stop_words_list:
keyphrases.append(keyphrase)

# remove potential empty keyphrases
keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase != '']

return list(set(keyphrases))

def _get_pos_keyphrases_of_multiple_docs(self, document_list: List[str], stop_words: str, spacy_pipeline: str,
pos_pattern: str, lowercase: bool = True) -> List[str]:
"""
Select keyphrases with part-of-speech tagging from a list of text documents.
Parameters
----------
document_list : list of str
List of text documents from which to extract the keyphrases.
stop_words : str
Language of stopwords to remove from the document, e.g.'english.
Supported options are `stopwords available in NLTK`_.
Removes unwanted stopwords from keyphrases if 'stop_words' is not None.
spacy_pipeline : str
The name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text.
pos_pattern : str
The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text.
lowercase : bool, default=True
Whether the returned keyphrases should be converted to lowercase.
Returns
-------
keyphrases : List of unique keyphrases of varying length, extracted from the given text documents with the given 'pos_pattern'.
"""

# triggers a parameter validation
if isinstance(document_list, str):
raise ValueError(
"Iterable over raw text documents expected, string object received."
)

# triggers a parameter validation
if not hasattr(document_list, '__iter__'):
raise ValueError(
"Iterable over raw text documents expected."
)

keyphrases = [
self._get_pos_keyphrases(document=doc, stop_words=stop_words, spacy_pipeline=spacy_pipeline,
pos_pattern=pos_pattern, lowercase=lowercase) for doc in document_list]
keyphrases = [keyphrase for sub_keyphrase_list in keyphrases for keyphrase in
sub_keyphrase_list]
return list(set(keyphrases))
for tagged_doc in nlp.pipe(document_list, n_process=num_workers):
tagged_pos_doc = []
for sentence in tagged_doc.sents:
pos_tagged_sentence = []
for word in sentence:
pos_tagged_sentence.append((word.text, word.tag_))
tagged_pos_doc.append(pos_tagged_sentence)

# extract keyphrases that match the NLTK RegexpParser filter
keyphrases = []
prefix_list = [stop_word + ' ' for stop_word in stop_words_list]
suffix_list = [' ' + stop_word for stop_word in stop_words_list]
for sentence in tagged_pos_doc:
tree = cp.parse(sentence)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':
# join candidate keyphrase from single words
keyphrase = ' '.join([i[0] for i in subtree.leaves()])

# convert keyphrase to lowercase
if lowercase:
keyphrase = keyphrase.lower()

# remove stopword suffixes
keyphrase = self._remove_suffixes(keyphrase, suffix_list)

# remove stopword prefixes
keyphrase = self._remove_prefixes(keyphrase, prefix_list)

# remove whitespace from the beginning and end of keyphrases
keyphrase = keyphrase.strip()

# do not include single keywords that are actually stopwords
if keyphrase.lower() not in stop_words_list:
keyphrases.append(keyphrase)

# remove potential empty keyphrases
keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase != '']

keyphrases_list.append(list(set(keyphrases)))

return list(set([keyphrase for sub_keyphrase_list in keyphrases_list for keyphrase in sub_keyphrase_list]))
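To illustrate what the reworked mixin now does, here is a condensed, self-contained sketch of the same tagging-and-chunking flow (not the library's exact code; the example document is made up):

import spacy
from nltk import RegexpParser

# Excluding pipeline components that are not needed for POS tagging is the
# source of the efficiency gain in this commit.
nlp = spacy.load('en_core_web_sm',
                 exclude=['ner', 'entity_linker', 'entity_ruler', 'textcat',
                          'textcat_multilabel', 'lemmatizer', 'morphologizer',
                          'senter', 'sentencizer', 'transformer'])

docs = ['Natural language processing enables efficient keyphrase extraction.']

# Chunk grammar built from the default POS pattern '<J.*>*<N.*>+'.
cp = RegexpParser('CHUNK: {(<J.*>*<N.*>+)}')

# n_process=-1 corresponds to multiprocessing=True (all cores); 1 is the default.
for tagged_doc in nlp.pipe(docs, n_process=1):
    for sentence in tagged_doc.sents:
        tree = cp.parse([(word.text, word.tag_) for word in sentence])
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                print(' '.join(leaf[0] for leaf in subtree.leaves()))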
