When trying to run the code that prepares the GloVe word-embeddings matrix, toward the end of the notebook for chapter 11, part 2, I get an error. This is the cell:
```python
embedding_dim = 100

# Retrieve the vocabulary indexed by our previous TextVectorization layer.
vocabulary = text_vectorization.get_vocabulary()
# Use it to create a mapping from words to their index in the vocabulary.
word_index = dict(zip(vocabulary, range(len(vocabulary))))

# Prepare a matrix that will be filled with the GloVe vectors.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    # Fill entry i in the matrix with the word vector for index i.
    # Words not found in the embedding index will be all zeros.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
```
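For context, the `embeddings_index` used in that cell is built a few cells earlier in the notebook from the downloaded GloVe file, presumably roughly along these lines (a sketch from memory; the file path is an assumption):

```python
import numpy as np

# Assumed location of the downloaded GloVe vectors.
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
# Passing encoding="utf-8" explicitly matters on Windows, where open()
# otherwise falls back to the locale encoding (typically cp1252).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <100 space-separated floats>".
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
```

The full traceback: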
```
UnicodeDecodeError                        Traceback (most recent call last)
Input In [29], in <cell line: 4>()
      1 embedding_dim = 100
      3 # Retrieve the vocabulary indexed by our previous TextVectorization layer.
----> 4 vocabulary = text_vectorization.get_vocabulary()
      5 # Use it to create a mapping from words to their index in the vocabulary.
      6 word_index = dict(zip(vocabulary, range(len(vocabulary))))

File C:\ProgramData\Anaconda3\envs\tf-gpu\lib\site-packages\keras\layers\preprocessing\text_vectorization.py:448, in TextVectorization.get_vocabulary(self, include_special_tokens)
    439 def get_vocabulary(self, include_special_tokens=True):
    440   """Returns the current vocabulary of the layer.
    441
    442   Args:
    (...)
    446     vocabulary will not include any padding or OOV tokens.
    447   """
--> 448   return self._lookup_layer.get_vocabulary(include_special_tokens)

File C:\ProgramData\Anaconda3\envs\tf-gpu\lib\site-packages\keras\layers\preprocessing\index_lookup.py:336, in IndexLookup.get_vocabulary(self, include_special_tokens)
    334 keys, values = self.lookup_table.export()
    335 vocab, indices = (values, keys) if self.invert else (keys, values)
--> 336 vocab, indices = (self._tensor_vocab_to_numpy(vocab), indices.numpy())
    337 lookup = collections.defaultdict(lambda: self.oov_token,
    338                                  zip(indices, vocab))
    339 vocab = [lookup[x] for x in range(self.vocabulary_size())]

File C:\ProgramData\Anaconda3\envs\tf-gpu\lib\site-packages\keras\layers\preprocessing\string_lookup.py:401, in StringLookup._tensor_vocab_to_numpy(self, vocabulary)
    399 def _tensor_vocab_to_numpy(self, vocabulary):
    400   vocabulary = vocabulary.numpy()
--> 401   return np.array([tf.compat.as_text(x, self.encoding) for x in vocabulary])

File C:\ProgramData\Anaconda3\envs\tf-gpu\lib\site-packages\keras\layers\preprocessing\string_lookup.py:401, in <listcomp>(.0)
    399 def _tensor_vocab_to_numpy(self, vocabulary):
    400   vocabulary = vocabulary.numpy()
--> 401   return np.array([tf.compat.as_text(x, self.encoding) for x in vocabulary])

File C:\ProgramData\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\util\compat.py:110, in as_text(bytes_or_text, encoding)
    108   return bytes_or_text
    109 elif isinstance(bytes_or_text, bytes):
--> 110   return bytes_or_text.decode(encoding)
    111 else:
    112   raise TypeError('Expected binary or unicode string, got %r' % bytes_or_text)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data
```
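In the last frame, `tf.compat.as_text` decodes each stored token as UTF-8. 0xC3 is the lead byte of a two-byte UTF-8 sequence, so "unexpected end of data" means at least one token in the layer's vocabulary ends in the middle of a multi-byte character, i.e. its stored bytes are not valid UTF-8 on their own. As a stopgap, just to inspect the vocabulary, something like the helper below might work. It is only a sketch: the function name is made up, and it leans on the private `_lookup_layer` / `lookup_table` attributes visible in the traceback, so it may not survive other Keras versions.

```python
def get_vocabulary_lenient(text_vectorization):
    # Hypothetical workaround: follow the same path as
    # TextVectorization.get_vocabulary(), but decode each token with
    # errors="replace" so malformed byte sequences do not raise.
    # Relies on private attributes and may break in other Keras versions.
    lookup = text_vectorization._lookup_layer
    keys, values = lookup.lookup_table.export()
    tokens, indices = keys.numpy(), values.numpy()
    # Indices missing from the exported table fall back to the OOV token,
    # mirroring what get_vocabulary() does with its defaultdict.
    vocab = [lookup.oov_token] * lookup.vocabulary_size()
    for token, index in zip(tokens, indices):
        if index < len(vocab):
            vocab[index] = token.decode("utf-8", errors="replace")
    return vocab

# vocabulary = get_vocabulary_lenient(text_vectorization)
# word_index = dict(zip(vocabulary, range(len(vocabulary))))
```

This only sidesteps the crash (malformed tokens come back with U+FFFD replacement characters); it does not explain how invalid UTF-8 ended up in the vocabulary in the first place.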