tokenizer = tfds.features.text.Tokenizer()
는 이제
tokenizer = tfds.deprecated.text.Tokenizer()
로 바뀌었다. 세상 참 빠르다.
(<tf.Tensor: shape=(), dtype=string, numpy=b'not a cloud to be seen neither on plain nor mountain. These last'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'To win the heart; there Love, there young Desire,'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'To parching airs beside the running stream;'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Their people as the pastured flock the ram'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"A vessel's plank is smooth and even laid,">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
b'not a cloud to be seen neither on plain nor mountain. These last'
[213, 12965, 228, 9770, 15265, 11378, 3288, 17101, 5332, 13656, 4080, 8818, 14602]
Epoch 1/3
2020-08-11 23:43:42.133082: I tensorflow/stream_executor/platform/default/] Successfully opened dynamic library cublas64_10.dll
2020-08-11 23:43:52.484863: I tensorflow/core/kernels/data/] Filling up shuffle buffer (this may take a while): 35287 of 50000
2020-08-11 23:43:54.470819: I tensorflow/core/kernels/data/] Shuffle buffer filled.
2020-08-11 23:43:54.500071: I tensorflow/stream_executor/platform/default/] Successfully opened dynamic library cudnn64_7.dll
697/697 [==============================] - 13s 19ms/step - loss: 0.5028 - accuracy: 0.7522 - val_loss: 0.3978 - val_accuracy: 0.8140
Epoch 2/3
2020-08-11 23:44:19.091046: I tensorflow/core/kernels/data/] Filling up shuffle buffer (this may take a while): 35584 of 50000
2020-08-11 23:44:21.195127: I tensorflow/core/kernels/data/] Shuffle buffer filled.
697/697 [==============================] - 12s 18ms/step - loss: 0.2949 - accuracy: 0.8707 - val_loss: 0.4052 - val_accuracy: 0.8206
Epoch 3/3
2020-08-11 23:44:43.657965: I tensorflow/core/kernels/data/] Filling up shuffle buffer (this may take a while): 35537 of 50000
2020-08-11 23:44:45.518081: I tensorflow/core/kernels/data/] Shuffle buffer filled.
697/697 [==============================] - 12s 17ms/step - loss: 0.2191 - accuracy: 0.9055 - val_loss: 0.3737 - val_accuracy: 0.8298
79/79 [==============================] - 2s 20ms/step - loss: 0.3737 - accuracy: 0.8298
Eval loss: 0.374, Eval accuracy: 0.830
import tensorflow as tf
import tensorflow_datasets as tfds
import os
# Location of the three text files that make up the corpus.
# NOTE(review): DIRECTORY_URL was referenced but never defined in this script
# (NameError). The value below is the one used by the TensorFlow "Load text"
# tutorial this script follows -- confirm it is still the intended source.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file; `get_file` returns the local path of the downloaded copy.
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)

# Only the containing directory is needed later, so the path of the last
# downloaded file is enough to derive it.
parent_dir = os.path.dirname(text_dir)
def labeler(example, index):
    """Attach a label to one example.

    Args:
        example: The dataset element to label; passed through unchanged.
        index: The label value.

    Returns:
        A tuple ``(example, label)`` where ``label`` is ``index`` cast to
        ``tf.int64``.
    """
    label = tf.cast(index, tf.int64)
    return example, label
# One labeled dataset per source file; the label is the file's position in
# FILE_NAMES.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    # Each element of the dataset is one line of the file.
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    # Collect the result; the code below indexes and iterates this list.
    labeled_data_sets.append(labeled_dataset)
# Sizes used by the input pipeline.
# BUFFER_SIZE and BATCH_SIZE were referenced but never defined in this script.
# The values below match the recorded run: the shuffle buffer is logged as
# "... of 50000", and evaluating TAKE_SIZE=5000 examples took 79 steps, which
# only a batch size of 64 produces.
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

# Chain the per-file datasets into a single dataset.
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# reshuffle_each_iteration=False keeps one fixed order across iterations, so
# the later skip/take split always selects the same examples.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

# Preview a few (text, label) pairs.
for ex in all_labeled_data.take(5):
    print(ex)
# The text helpers formerly under `tfds.features.text` now live under
# `tfds.deprecated.text`.
tokenizer = tfds.deprecated.text.Tokenizer()

# Collect every distinct token that occurs anywhere in the corpus.
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)

# Maps each token of the vocabulary to an integer id.
encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

# Show one raw line and its encoded form as a sanity check.
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

encoded_example = encoder.encode(example_text)
print(encoded_example)
def encode(text_tensor, label):
    """Encode the text of one example with the module-level ``encoder``.

    Args:
        text_tensor: Eager tensor holding the text; its ``.numpy()`` value is
            what gets encoded.
        label: The example's label; returned unchanged.

    Returns:
        A tuple ``(encoded_text, label)``.
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label
def encode_map_fn(text, label):
    """Run ``encode`` through ``tf.py_function`` and restore static shapes.

    Args:
        text: Tensor holding the text of one example.
        label: Tensor holding the label of that example.

    Returns:
        A tuple ``(encoded_text, label)`` of ``tf.int64`` tensors;
        ``encoded_text`` has shape ``[None]`` and ``label`` is a scalar.
    """
    # py_func doesn't set the shape of the returned tensors.
    encoded_text, label = tf.py_function(encode,
                                         inp=[text, label],
                                         Tout=(tf.int64, tf.int64))

    # `tf.data.Datasets` work best if all components have a shape set
    # so set the shapes manually:
    encoded_text.set_shape([None])  # variable-length sequence of token ids
    label.set_shape([])             # scalar label

    return encoded_text, label
# Encode every (text, label) pair of the labeled dataset.
all_encoded_data = all_labeled_data.map(encode_map_fn)

# Everything after the first TAKE_SIZE examples is used for training.
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE)

# The first TAKE_SIZE examples are held out for validation/evaluation.
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)

# Pull one batch to inspect it.
sample_text, sample_labels = next(iter(test_data))

# Bare expression: only displays a value when run as a notebook cell.
sample_text[0], sample_labels[0]
# NOTE(review): presumably makes room for the id 0 that padded batches use as
# filler -- confirm against the encoder's id range.
vocab_size += 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
# NOTE(review): reconstructed. The script had no layer collapsing the sequence
# dimension between the embedding and the dense layers; the TensorFlow
# tutorial this follows uses a bidirectional LSTM here -- confirm.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(len(FILE_NAMES)))

# NOTE(review): only the `metrics=['accuracy'])` tail of the compile call
# survived; optimizer and loss are reconstructed -- confirm. The output layer
# has no activation, hence from_logits=True with integer class labels.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_data, epochs=3, validation_data=test_data)
# Score the trained model on the held-out batches and report both metrics
# rounded to three decimals.
eval_loss, eval_acc = model.evaluate(test_data)

print(
    '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc)
)
