From 809160bc663cebfd14b09e7ae3467d2ead43b703 Mon Sep 17 00:00:00 2001 From: niushuaibing Date: Fri, 14 Feb 2025 16:02:33 +0800 Subject: [PATCH 1/3] fix path error --- .../tutorials/keras/text_classification.ipynb | 1960 ++++++++--------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1952 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- 6 files changed, 5844 insertions(+), 5844 deletions(-) diff --git a/site/en-snapshot/tutorials/keras/text_classification.ipynb b/site/en-snapshot/tutorials/keras/text_classification.ipynb index 4182c3f295..f6c1506722 100644 --- a/site/en-snapshot/tutorials/keras/text_classification.ipynb +++ b/site/en-snapshot/tutorials/keras/text_classification.ipynb @@ -1,982 +1,982 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# Basic text classification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " View on TensorFlow.org\n", - " \n", - " Run in Google Colab\n", - " \n", - " View source on GitHub\n", - " \n", - " Download notebook\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "This tutorial demonstrates text classification starting from plain text files stored on disk. You'll train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try, in which you'll train a multi-class classifier to predict the tag for a programming question on Stack Overflow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## Sentiment analysis\n", - "\n", - "This notebook trains a sentiment analysis model to classify movie reviews as *positive* or *negative*, based on the text of the review. This is an example of *binary*—or two-class—classification, an important and widely applicable kind of machine learning problem.\n", - "\n", - "You'll use the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) that contains the text of 50,000 movie reviews from the [Internet Movie Database](https://www.imdb.com/). These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are *balanced*, meaning they contain an equal number of positive and negative reviews.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### Download and explore the IMDB dataset\n", - "\n", - "Let's download and extract the dataset, then explore the directory structure." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "The `aclImdb/train/pos` and `aclImdb/train/neg` directories contain many text files, each of which is a single movie review. Let's take a look at one of them." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### Load the dataset\n", - "\n", - "Next, you will load the data off disk and prepare it into a format suitable for training. To do so, you will use the helpful [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) utility, which expects a directory structure as follows.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "To prepare a dataset for binary classification, you will need two folders on disk, corresponding to `class_a` and `class_b`. These will be the positive and negative movie reviews, which can be found in `aclImdb/train/pos` and `aclImdb/train/neg`. As the IMDB dataset contains additional folders, you will remove them before using this utility." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "Next, you will use the `text_dataset_from_directory` utility to create a labeled `tf.data.Dataset`. [tf.data](https://www.tensorflow.org/guide/data) is a powerful collection of tools for working with data. \n", - "\n", - "When running a machine learning experiment, it is a best practice to divide your dataset into three splits: [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set), and [test](https://developers.google.com/machine-learning/glossary#test-set). \n", - "\n", - "The IMDB dataset has already been divided into train and test, but it lacks a validation set. Let's create a validation set using an 80:20 split of the training data by using the `validation_split` argument below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "As you can see above, there are 25,000 examples in the training folder, of which you will use 80% (or 20,000) for training. As you will see in a moment, you can train a model by passing a dataset directly to `model.fit`. If you're new to `tf.data`, you can also iterate over the dataset and print out a few examples as follows." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - "  for i in range(3):\n", - "    print(\"Review\", text_batch.numpy()[i])\n", - "    print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "Notice the reviews contain raw text (with punctuation and occasional HTML tags like `<br />
`). You will show how to handle these in the following section. \n", - "\n", - "The labels are 0 or 1. To see which of these correspond to positive and negative movie reviews, you can check the `class_names` property on the dataset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "Next, you will create a validation and test dataset. You will use the remaining 5,000 reviews from the training set for validation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "Note: When using the `validation_split` and `subset` arguments, make sure to either specify a random seed, or to pass `shuffle=False`, so that the validation and training splits have no overlap." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### Prepare the dataset for training\n", - "\n", - "Next, you will standardize, tokenize, and vectorize the data using the helpful `tf.keras.layers.TextVectorization` layer. \n", - "\n", - "Standardization refers to preprocessing the text, typically to remove punctuation or HTML elements to simplify the dataset. Tokenization refers to splitting strings into tokens (for example, splitting a sentence into individual words, by splitting on whitespace). Vectorization refers to converting tokens into numbers so they can be fed into a neural network. All of these tasks can be accomplished with this layer.\n", - "\n", - "As you saw above, the reviews contain various HTML tags like `
<br />`. These tags will not be removed by the default standardizer in the `TextVectorization` layer (which converts text to lowercase and strips punctuation by default, but doesn't strip HTML). You will write a custom standardization function to remove the HTML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "Note: To prevent [training-testing skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (also known as training-serving skew), it is important to preprocess the data identically at train and test time. To facilitate this, the `TextVectorization` layer can be included directly inside your model, as shown later in this tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - "  lowercase = tf.strings.lower(input_data)\n", - "  stripped_html = tf.strings.regex_replace(lowercase, '<br />
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "Next, you will create a `TextVectorization` layer. You will use this layer to standardize, tokenize, and vectorize our data. You set the `output_mode` to `int` to create unique integer indices for each token.\n", - "\n", - "Note that you're using the default split function, and the custom standardization function you defined above. You'll also define some constants for the model, like an explicit maximum `sequence_length`, which will cause the layer to pad or truncate sequences to exactly `sequence_length` values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "Next, you will call `adapt` to fit the state of the preprocessing layer to the dataset. This will cause the model to build an index of strings to integers." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "Note: It's important to only use your training data when calling adapt (using the test set would leak information)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "Let's create a function to see the result of using this layer to preprocess some data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "As you can see above, each token has been replaced by an integer. You can lookup the token (string) that each integer corresponds to by calling `.get_vocabulary()` on the layer." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "You are nearly ready to train your model. As a final preprocessing step, you will apply the TextVectorization layer you created earlier to the train, validation, and test dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### Configure the dataset for performance\n", - "\n", - "These are two important methods you should use when loading data to make sure that I/O does not become blocking.\n", - "\n", - "`.cache()` keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.\n", - "\n", - "`.prefetch()` overlaps data preprocessing and model execution while training. \n", - "\n", - "You can learn more about both methods, as well as how to cache data to disk in the [data performance guide](https://www.tensorflow.org/guide/data_performance)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### Create the model\n", - "\n", - "It's time to create your neural network:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "The layers are stacked sequentially to build the classifier:\n", - "\n", - "1. The first layer is an `Embedding` layer. This layer takes the integer-encoded reviews and looks up an embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: `(batch, sequence, embedding)`. To learn more about embeddings, check out the [Word embeddings](https://www.tensorflow.org/text/guide/word_embeddings) tutorial.\n", - "2. 
Next, a `GlobalAveragePooling1D` layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.\n", - "3. The last layer is densely connected with a single output node." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### Loss function and optimizer\n", - "\n", - "A model needs a loss function and an optimizer for training. Since this is a binary classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), you'll use `losses.BinaryCrossentropy` loss function.\n", - "\n", - "Now, configure the model to use an optimizer and a loss function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### Train the model\n", - "\n", - "You will train the model by passing the `dataset` object to the fit method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### Evaluate the model\n", - "\n", - "Let's see how the model performs. Two values will be returned. Loss (a number which represents our error, lower values are better), and accuracy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "This fairly naive approach achieves an accuracy of about 86%." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### Create a plot of accuracy and loss over time\n", - "\n", - "`model.fit()` returns a `History` object that contains a dictionary with everything that happened during training:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "There are four entries: one for each monitored metric during training and validation. 
You can use these to plot the training and validation loss for comparison, as well as the training and validation accuracy:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "In this plot, the dots represent the training loss and accuracy, and the solid lines are the validation loss and accuracy.\n", - "\n", - "Notice the training loss *decreases* with each epoch and the training accuracy *increases* with each epoch. This is expected when using a gradient descent optimization—it should minimize the desired quantity on every iteration.\n", - "\n", - "This isn't the case for the validation loss and accuracy—they seem to peak before the training accuracy. This is an example of overfitting: the model performs better on the training data than it does on data it has never seen before. After this point, the model over-optimizes and learns representations *specific* to the training data that do not *generalize* to test data.\n", - "\n", - "For this particular case, you could prevent overfitting by simply stopping the training when the validation accuracy is no longer increasing. One way to do so is to use the `tf.keras.callbacks.EarlyStopping` callback." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## Export the model\n", - "\n", - "In the code above, you applied the `TextVectorization` layer to the dataset before feeding text to the model. If you want to make your model capable of processing raw strings (for example, to simplify deploying it), you can include the `TextVectorization` layer inside your model. To do so, you can create a new model using the weights you just trained." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### Inference on new data\n", - "\n", - "To get predictions for new examples, you can simply call `model.predict()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "Including the text preprocessing logic inside your model enables you to export a model for production that simplifies deployment, and reduces the potential for [train/test skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", - "\n", - "There is a performance difference to keep in mind when choosing where to apply your TextVectorization layer. Using it outside of your model enables you to do asynchronous CPU processing and buffering of your data when training on GPU. So, if you're training your model on the GPU, you probably want to go with this option to get the best performance while developing your model, then switch to including the TextVectorization layer inside your model when you're ready to prepare for deployment.\n", - "\n", - "Visit this [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) to learn more about saving models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## Exercise: multi-class classification on Stack Overflow questions\n", - "\n", - "This tutorial showed how to train a binary classifier from scratch on the IMDB dataset. As an exercise, you can modify this notebook to train a multi-class classifier to predict the tag of a programming question on [Stack Overflow](http://stackoverflow.com/).\n", - "\n", - "A [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) has been prepared for you to use containing the body of several thousand programming questions (for example, \"How can I sort a dictionary by value in Python?\") posted to Stack Overflow. Each of these is labeled with exactly one tag (either Python, CSharp, JavaScript, or Java). Your task is to take a question as input, and predict the appropriate tag, in this case, Python. 
\n", - "\n", - "The dataset you will work with contains several thousand questions extracted from the much larger public Stack Overflow dataset on [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), which contains more than 17 million posts.\n", - "\n", - "After downloading the dataset, you will find it has a similar directory structure to the IMDB dataset you worked with previously:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "Note: To increase the difficulty of the classification problem, occurrences of the words Python, CSharp, JavaScript, or Java in the programming questions have been replaced with the word *blank* (as many questions contain the language they're about).\n", - "\n", - "To complete this exercise, you should modify this notebook to work with the Stack Overflow dataset by making the following modifications:\n", - "\n", - "1. At the top of your notebook, update the code that downloads the IMDB dataset with code to download the [Stack Overflow dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) that has already been prepared. As the Stack Overflow dataset has a similar directory structure, you will not need to make many modifications.\n", - "\n", - "1. Modify the last layer of your model to `Dense(4)`, as there are now four output classes.\n", - "\n", - "1. When compiling the model, change the loss to `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. This is the correct loss function to use for a multi-class classification problem, when the labels for each class are integers (in this case, they can be 0, *1*, *2*, or *3*). In addition, change the metrics to `metrics=['accuracy']`, since this is a multi-class classification problem (`tf.metrics.BinaryAccuracy` is only used for binary classifiers).\n", - "\n", - "1. When plotting accuracy over time, change `binary_accuracy` and `val_binary_accuracy` to `accuracy` and `val_accuracy`, respectively.\n", - "\n", - "1. Once these changes are complete, you will be able to train a multi-class classifier. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## Learning more\n", - "\n", - "This tutorial introduced text classification from scratch. To learn more about the text classification workflow in general, check out the [Text classification guide](https://developers.google.com/machine-learning/guides/text-classification/) from Google Developers.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# Basic text classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " View on TensorFlow.org\n", + " \n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + " \n", + " Download notebook\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "This tutorial demonstrates text classification starting from plain text files stored on disk. You'll train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try, in which you'll train a multi-class classifier to predict the tag for a programming question on Stack Overflow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## Sentiment analysis\n", + "\n", + "This notebook trains a sentiment analysis model to classify movie reviews as *positive* or *negative*, based on the text of the review. This is an example of *binary*—or two-class—classification, an important and widely applicable kind of machine learning problem.\n", + "\n", + "You'll use the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) that contains the text of 50,000 movie reviews from the [Internet Movie Database](https://www.imdb.com/). These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are *balanced*, meaning they contain an equal number of positive and negative reviews.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### Download and explore the IMDB dataset\n", + "\n", + "Let's download and extract the dataset, then explore the directory structure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "The `aclImdb/train/pos` and `aclImdb/train/neg` directories contain many text files, each of which is a single movie review. Let's take a look at one of them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### Load the dataset\n", + "\n", + "Next, you will load the data off disk and prepare it into a format suitable for training. To do so, you will use the helpful [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) utility, which expects a directory structure as follows.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "To prepare a dataset for binary classification, you will need two folders on disk, corresponding to `class_a` and `class_b`. These will be the positive and negative movie reviews, which can be found in `aclImdb/train/pos` and `aclImdb/train/neg`. As the IMDB dataset contains additional folders, you will remove them before using this utility." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "Next, you will use the `text_dataset_from_directory` utility to create a labeled `tf.data.Dataset`. [tf.data](https://www.tensorflow.org/guide/data) is a powerful collection of tools for working with data. \n", + "\n", + "When running a machine learning experiment, it is a best practice to divide your dataset into three splits: [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set), and [test](https://developers.google.com/machine-learning/glossary#test-set). \n", + "\n", + "The IMDB dataset has already been divided into train and test, but it lacks a validation set. Let's create a validation set using an 80:20 split of the training data by using the `validation_split` argument below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "As you can see above, there are 25,000 examples in the training folder, of which you will use 80% (or 20,000) for training. As you will see in a moment, you can train a model by passing a dataset directly to `model.fit`. If you're new to `tf.data`, you can also iterate over the dataset and print out a few examples as follows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + "  for i in range(3):\n", + "    print(\"Review\", text_batch.numpy()[i])\n", + "    print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "Notice the reviews contain raw text (with punctuation and occasional HTML tags like `<br />
`). You will show how to handle these in the following section. \n", + "\n", + "The labels are 0 or 1. To see which of these correspond to positive and negative movie reviews, you can check the `class_names` property on the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "Next, you will create a validation and test dataset. You will use the remaining 5,000 reviews from the training set for validation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "Note: When using the `validation_split` and `subset` arguments, make sure to either specify a random seed, or to pass `shuffle=False`, so that the validation and training splits have no overlap." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### Prepare the dataset for training\n", + "\n", + "Next, you will standardize, tokenize, and vectorize the data using the helpful `tf.keras.layers.TextVectorization` layer. \n", + "\n", + "Standardization refers to preprocessing the text, typically to remove punctuation or HTML elements to simplify the dataset. Tokenization refers to splitting strings into tokens (for example, splitting a sentence into individual words, by splitting on whitespace). Vectorization refers to converting tokens into numbers so they can be fed into a neural network. All of these tasks can be accomplished with this layer.\n", + "\n", + "As you saw above, the reviews contain various HTML tags like `
<br />`. These tags will not be removed by the default standardizer in the `TextVectorization` layer (which converts text to lowercase and strips punctuation by default, but doesn't strip HTML). You will write a custom standardization function to remove the HTML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "Note: To prevent [training-testing skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (also known as training-serving skew), it is important to preprocess the data identically at train and test time. To facilitate this, the `TextVectorization` layer can be included directly inside your model, as shown later in this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + "  lowercase = tf.strings.lower(input_data)\n", + "  stripped_html = tf.strings.regex_replace(lowercase, '<br />
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "Next, you will create a `TextVectorization` layer. You will use this layer to standardize, tokenize, and vectorize our data. You set the `output_mode` to `int` to create unique integer indices for each token.\n", + "\n", + "Note that you're using the default split function, and the custom standardization function you defined above. You'll also define some constants for the model, like an explicit maximum `sequence_length`, which will cause the layer to pad or truncate sequences to exactly `sequence_length` values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "Next, you will call `adapt` to fit the state of the preprocessing layer to the dataset. This will cause the model to build an index of strings to integers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "Note: It's important to only use your training data when calling adapt (using the test set would leak information)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "Let's create a function to see the result of using this layer to preprocess some data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "As you can see above, each token has been replaced by an integer. You can lookup the token (string) that each integer corresponds to by calling `.get_vocabulary()` on the layer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "You are nearly ready to train your model. As a final preprocessing step, you will apply the TextVectorization layer you created earlier to the train, validation, and test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### Configure the dataset for performance\n", + "\n", + "These are two important methods you should use when loading data to make sure that I/O does not become blocking.\n", + "\n", + "`.cache()` keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.\n", + "\n", + "`.prefetch()` overlaps data preprocessing and model execution while training. \n", + "\n", + "You can learn more about both methods, as well as how to cache data to disk in the [data performance guide](https://www.tensorflow.org/guide/data_performance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### Create the model\n", + "\n", + "It's time to create your neural network:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "The layers are stacked sequentially to build the classifier:\n", + "\n", + "1. The first layer is an `Embedding` layer. This layer takes the integer-encoded reviews and looks up an embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: `(batch, sequence, embedding)`. To learn more about embeddings, check out the [Word embeddings](https://www.tensorflow.org/text/guide/word_embeddings) tutorial.\n", + "2. 
Next, a `GlobalAveragePooling1D` layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.\n", + "3. The last layer is densely connected with a single output node." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### Loss function and optimizer\n", + "\n", + "A model needs a loss function and an optimizer for training. Since this is a binary classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), you'll use `losses.BinaryCrossentropy` loss function.\n", + "\n", + "Now, configure the model to use an optimizer and a loss function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### Train the model\n", + "\n", + "You will train the model by passing the `dataset` object to the fit method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### Evaluate the model\n", + "\n", + "Let's see how the model performs. Two values will be returned. Loss (a number which represents our error, lower values are better), and accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "This fairly naive approach achieves an accuracy of about 86%." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### Create a plot of accuracy and loss over time\n", + "\n", + "`model.fit()` returns a `History` object that contains a dictionary with everything that happened during training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "There are four entries: one for each monitored metric during training and validation. 
You can use these to plot the training and validation loss for comparison, as well as the training and validation accuracy:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "In this plot, the dots represent the training loss and accuracy, and the solid lines are the validation loss and accuracy.\n", + "\n", + "Notice the training loss *decreases* with each epoch and the training accuracy *increases* with each epoch. This is expected when using a gradient descent optimization—it should minimize the desired quantity on every iteration.\n", + "\n", + "This isn't the case for the validation loss and accuracy—they seem to peak before the training accuracy. This is an example of overfitting: the model performs better on the training data than it does on data it has never seen before. After this point, the model over-optimizes and learns representations *specific* to the training data that do not *generalize* to test data.\n", + "\n", + "For this particular case, you could prevent overfitting by simply stopping the training when the validation accuracy is no longer increasing. One way to do so is to use the `tf.keras.callbacks.EarlyStopping` callback." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## Export the model\n", + "\n", + "In the code above, you applied the `TextVectorization` layer to the dataset before feeding text to the model. If you want to make your model capable of processing raw strings (for example, to simplify deploying it), you can include the `TextVectorization` layer inside your model. To do so, you can create a new model using the weights you just trained." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### Inference on new data\n", + "\n", + "To get predictions for new examples, you can simply call `model.predict()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "Including the text preprocessing logic inside your model enables you to export a model for production that simplifies deployment, and reduces the potential for [train/test skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", + "\n", + "There is a performance difference to keep in mind when choosing where to apply your TextVectorization layer. Using it outside of your model enables you to do asynchronous CPU processing and buffering of your data when training on GPU. So, if you're training your model on the GPU, you probably want to go with this option to get the best performance while developing your model, then switch to including the TextVectorization layer inside your model when you're ready to prepare for deployment.\n", + "\n", + "Visit this [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) to learn more about saving models." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## Exercise: multi-class classification on Stack Overflow questions\n", + "\n", + "This tutorial showed how to train a binary classifier from scratch on the IMDB dataset. As an exercise, you can modify this notebook to train a multi-class classifier to predict the tag of a programming question on [Stack Overflow](http://stackoverflow.com/).\n", + "\n", + "A [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) has been prepared for you to use containing the body of several thousand programming questions (for example, \"How can I sort a dictionary by value in Python?\") posted to Stack Overflow. Each of these is labeled with exactly one tag (either Python, CSharp, JavaScript, or Java). Your task is to take a question as input, and predict the appropriate tag, in this case, Python. 
\n", + "\n", + "The dataset you will work with contains several thousand questions extracted from the much larger public Stack Overflow dataset on [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), which contains more than 17 million posts.\n", + "\n", + "After downloading the dataset, you will find it has a similar directory structure to the IMDB dataset you worked with previously:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "Note: To increase the difficulty of the classification problem, occurrences of the words Python, CSharp, JavaScript, or Java in the programming questions have been replaced with the word *blank* (as many questions contain the language they're about).\n", + "\n", + "To complete this exercise, you should modify this notebook to work with the Stack Overflow dataset by making the following modifications:\n", + "\n", + "1. At the top of your notebook, update the code that downloads the IMDB dataset with code to download the [Stack Overflow dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) that has already been prepared. As the Stack Overflow dataset has a similar directory structure, you will not need to make many modifications.\n", + "\n", + "1. Modify the last layer of your model to `Dense(4)`, as there are now four output classes.\n", + "\n", + "1. When compiling the model, change the loss to `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. This is the correct loss function to use for a multi-class classification problem, when the labels for each class are integers (in this case, they can be 0, *1*, *2*, or *3*). In addition, change the metrics to `metrics=['accuracy']`, since this is a multi-class classification problem (`tf.metrics.BinaryAccuracy` is only used for binary classifiers).\n", + "\n", + "1. When plotting accuracy over time, change `binary_accuracy` and `val_binary_accuracy` to `accuracy` and `val_accuracy`, respectively.\n", + "\n", + "1. Once these changes are complete, you will be able to train a multi-class classifier. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## Learning more\n", + "\n", + "This tutorial introduced text classification from scratch. To learn more about the text classification workflow in general, check out the [Text classification guide](https://developers.google.com/machine-learning/guides/text-classification/) from Google Developers.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/es-419/tutorials/keras/text_classification.ipynb b/site/es-419/tutorials/keras/text_classification.ipynb index 889018a71c..c49131f1b0 100644 --- a/site/es-419/tutorials/keras/text_classification.ipynb +++ b/site/es-419/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# Clasificación básica de textos" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
Ver en TensorFlow.orgEjecutar en Google Colab Ver fuente en GitHub Descargar notebook
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "En este tutorial se muestra la clasificación de textos a partir de archivos de texto plano almacenados en un disco. Entrenará un clasificador binario para que analice los sentimientos de un conjunto de datos de IMDB. Al final del bloc de notas, hay un ejercicio para que lo ponga a prueba, en el que entrenará un clasificador multiclase para predecir la etiqueta de una pregunta de programación de Stack Overflow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## Análisis de sentimientos\n", - "\n", - "En este bloc de notas se entrena un modelo de análisis de sentimiento para clasificar reseñas de películas como *positivas* o *negativas* a partir del texto de la reseña. Este es un ejemplo de clasificación *binaria* (o de dos clases), un tipo de problema de aprendizaje automático importante y ampliamente aplicable.\n", - "\n", - "Usará los [enormes conjuntos de datos de reseñas de películas](https://ai.stanford.edu/~amaas/data/sentiment/) que contienen el texto de 50 000 reseñas de películas de [Internet Movie Database](https://www.imdb.com/). Se divide en 25 000 reseñas para entrenamiento y 25 000 reseñas para prueba. Los conjuntos de entrenamiento y prueba están *equilibrados*, lo que significa que contienen la misma cantidad de reseñas positivas y negativas.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### Descargar y explorar el conjunto de datos de IMDB\n", - "\n", - "Descarguemos y extraigamos los conjuntos de datos, luego, exploremos la estructura del directorio." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "Los directorios `aclImdb/train/pos` y `aclImdb/train/neg` contienen muchos archivos de texto, donde cada uno corresponde a una reseña de película. Echemos un vistazo a uno de ellos." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### Cargar el conjunto de datos\n", - "\n", - "A continuación, cargará los datos del disco y los preparará en un formato adecuado para el entrenamiento. Para esto, usará la práctica utilidad [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera una estructura de directorios como la que se muestra a continuación.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "Para preparar el conjunto de datos para clasificación binaria, necesita dos carpetas en el disco, que correspondan con `class_a` y `class_b`. Estas serán las reseñas positivas y negativas de las películas, que se pueden encontrar en `aclImdb/train/pos` y `aclImdb/train/neg`. Dado que el conjunto de datos de IMDB contiene carpetas adicionales, deberá eliminarlas antes de usar esta utilidad." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "Luego, usará la utilidad `text_dataset_from_directory` para crear un `tf.data.Dataset` etiquetado. [tf.data](https://www.tensorflow.org/guide/data) es una potente colección de herramientas para trabajar con datos.\n", - "\n", - "A la hora de hacer un experimento de aprendizaje automático, lo mejor es dividir el conjunto de datos en tres partes: [entrenamiento](https://developers.google.com/machine-learning/glossary#training_set), [validación](https://developers.google.com/machine-learning/glossary#validation_set) y [prueba](https://developers.google.com/machine-learning/glossary#test-set).\n", - "\n", - "El conjunto de datos de IMDB ya está dividido en entrenamiento y prueba, pero no cuenta con un conjunto de validación. Creemos un conjunto de validación mediante una división 80:20 de los datos de entrenamiento con ayuda del argumento `validation_split` que se muestra a continuación." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "Como puede ver en el bloque de arriba, hay 25 000 ejemplos en la carpeta de entrenamiento, de lo que usará el 80 % (o 20 000) para entrenamiento. Como verá en un momento, puede entrenar un modelo pasando un conjunto de datos directamente a `model.fit`. Si es la primera vez que usa `tf.data`, también puede iterar el conjunto de datos e imprimir algunos ejemplos como se muestra a continuación." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "Tenga en cuenta que las reseñas contienen texto bruto (con puntuación y algunas etiquetas HTML como `
`). En la siguiente sección le mostraremos cómo debe manejar esto.\n", - "\n", - "Las etiquetas son 0 o 1. Para ver cuál corresponde a las reseñas positivas y negativas de las películas, puede consultar la propiedad `class_names` en el conjunto de datos.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "A continuación, creará un conjunto de datos de validación y prueba. Usará las 5000 reseñas restantes del conjunto de entrenamiento para ejecutar la validación." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "Nota: Cuando use los argumentos `validation_split` y `subset`, asegúrese de especificar una semilla o de pasar `shuffle=False`, para que las fracciones de validación y entrenamiento no se superpongan." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### Preparar el conjunto de datos para entrenamiento\n", - "\n", - "A continuación, usará la útil capa `tf.keras.layers.TextVectorization` para estndarizar, tokenizar y vectorizar los datos.\n", - "\n", - "El término estandarización se refiere al preprocesamiento del texto que generalmente se utiliza para eliminar la puntuación o los elementos de HTML con el objetivo de simplificar el conjunto de datos. Tokenizar en este contexto es dividir las cadenas en tokens (por ejemplo, separar una frase en palabras individuales, usando los espacios en blanco para separar). La vetorización se refiere al proceso mediante el cual los tokens se convierten en números que se pueden cargar a la red neuronal. Todas estas tareas se pueden completar con esta capa.\n", - "\n", - "Como pudo ver anteriormente, las reseñas contienen varias etiquetas HTML como `
`. El estandarizador predeterminado de la capa `TextVectorization` (que convierte texto a minúsculas y elimina la puntuación de forma predeterminada, pero no elimina los elementos de HTML) no eliminará estas etiquetas. Deberá escribir una función de estandarización personalizada para eliminar el HTML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "Nota: Para evitar el [sesgo entrenamiento-prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (también conocido como sesgo entrenamiento-servicio), es importante preprocesar los datos de forma idéntica tanto durante el entrenamiento como en la etapa de prueba. Para simplificar esto, la capa `TextVectorization` se puede incluir directamente dentro del modelo, como se muestra más adelante en este tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "Luego, creará una capa `TextVectorization`. Usará esta capa para estandarizar, tokenizar y vectorizar nuestros datos. Configurará `output_mode` en `int` para crear índices enteros únicos para cada token.\n", - "\n", - "Tenga en cuenta que está usando la función de separación predeterminada y la función de estandarización personalizada que definió anteriormente. También deberá definir algunas constantes para el modelo, como un valor máximo explícito de `sequence_length`, que hará que cada capa amortigüe o trunque las secuencias exactamente a los valores `sequence_length`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "A continuación, llamará `adapt` para que ajuste el estado de la capa de preprocesamiento al conjunto de datos. Esto hará que el modelo convierta un índice de cadenas a enteros." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "Nota: Es importante que solo use sus datos de entrenamiento para al llamar adapt (si usa el conjunto de prueba, se podría filtrar información)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "Creemos una función para ver los resultados del uso de esta capa para preprocesar algunos datos." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "Como pudo ver anteriormente, cada token ha sido reemplazo por un entero. Puede buscar el token (cadena) al que corresponde cada entero llamando `.get_vocabulary()` en la capa." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "Ya está casi listo para entrenar su modelo. Como último paso de preprocesamiento, debe aplicar la capa TextVectorization que creó anteriormente a los conjuntos de datos de entrenamiento, validación y prueba." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### Configurar el conjunto de datos para mejorar el rendimiento\n", - "\n", - "Hay dos métodos importantes que debería usar al cargar los datos para asegurarse de que la E/S no se bloquee.\n", - "\n", - "`.cache()` conserva los datos en la memoria después de que descarga del disco. Esto evitará que el conjunto de datos se transforme en un cuello de botella mientras entrena su modelo. Si su conjunto de datos es demasiado grande para caber en la memoria, también puede usar este método para crear un potente caché en disco, que se lee de forma más eficiente que muchos archivos pequeños.\n", - "\n", - "`.prefetch()` superpone el preprocesamiento de los datos y la ejecución del modelo durante el entrenamiento.\n", - "\n", - "Puede obtener más información sobre ambos métodos y sobre cómo almacenar datos en caché en disco en la [guía de rendimiento de datos](https://www.tensorflow.org/guide/data_performance)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### Crear el modelo\n", - "\n", - "Llegó la hora de que cree su red neuronal:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "Las capas se apilan secuencialmente para generar el clasificador:\n", - "\n", - "1. La primera capa es una capa `Embedding`. Esta capa toma las reseñas cifradas con números enteros y busca un vector de incorporación para cada índice de palabra. Estos vectores se aprenden a medida que se entrena el modelo. Los vectores agregan una dimensión al arreglo de salida. 
Las dimensiones resultantes son las siguientes: `(batch, sequence, embedding)`. Para obtener más información sobre las incorporaciones, consulte el tutorial [Incorporaciones de palabras](https://www.tensorflow.org/text/guide/word_embeddings).\n", - "2. A continuación, una capa `GlobalAveragePooling1D` devuelve un vector de salida de longitud fija para cada ejemplo calculando el promedio sobre la dimensión de la secuencia. Esto le permite a modelo manejar entradas de longitud variable, de la forma más sencilla posible.\n", - "3. La última capa está densamente conectada con un único nodo de salida." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### Función de pérdida y optimizador\n", - "\n", - "Un modelo necesita una función de pérdida y un optimizador para el entrenamiento. Dado que este es un problema de clasificación binaria y el modelo genera una probabilidad (una capa de una sola unidad con una activación sigmoide), usaremos la función de pérdida `losses.BinaryCrossentropy`.\n", - "\n", - "Ahora, configure el modelo para usar un optimizador y una función de pérdida:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### Entrenar el modelo\n", - "\n", - "Entrenará el modelo pasando el objeto `dataset` al método fit." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### Evaluar el modelo\n", - "\n", - "Veamos el rendimiento del modelo. Nos devolverá dos valores; la pérdida (un número que representa nuestro error, los valores bajos son mejores) y la precisión." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "Este enfoque bastante sencillo alcanza una precisión del 86 %." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### Cree un gráfico de precisión y pérdida a lo largo del tiempo\n", - "\n", - "`model.fit()` devuelve un objeto `History` que contiene un diccionario con todo lo que pasó durante el entrenamiento:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "Hay cuatro entradas: una por cada métrica que se monitoreó durante el entrenamiento y la validación. 
Puede usarlas para trazar la pérdida de entrenamiento y validación para compararlas, puede hacer lo mismo con la precisión:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "En este gráfico, los puntos representan la pérdida y la precisión del entrenamiento y las líneas continuas reflejan la pérdida y la precisión de la validación.\n", - "\n", - "Como puede ver, la pérdida del entrenamiento *se reduce* época tras época y la precisión del entrenamiento *aumenta* a medida que pasan las épocas. Esto es lo que suele pasar cuando se usa una optimización con descenso de gradiente, debe reducir al mínimo la cantidad deseada en cada iteración.\n", - "\n", - "Esto no es lo que sucede en el caso de la pérdida y la precisión de la validación, al parecer llegan a su punto máximo antes que la precisión del entrenamiento. Este es un ejemplo de sobreajuste: el modelo funciona mejor con los datos de entrenamiento que con los datos que no ha visto anteriormente. Pasado este punto, el modelo se sobreoptimiza y aprende representaciones *específicas* de los datos de entrenamiento que no se *generalizan* a los datos de prueba.\n", - "\n", - "En este caso particular, podría evitar el sobreajuste con tan solo detener el entrenamiento cuando la precisión de validación deje de aumentar. Una forma de hacerlo es con la retrollamada `tf.keras.callbacks.EarlyStopping`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## Exportar el modelo\n", - "\n", - "En el código que vimos arriba, se aplicó la capa `TextVectorization` al conjunto de datos antes de cargar texto al modelo. Si desea que su modelo sea capaz de procesar cadenas sin procesar (por ejemplo, para simplificar la implementación), puede incluir la capa `TextVectorization` en su modelo. Para ello, puede crear un nuevo modelo a partir de los pesos que acaba de entrenar." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### Inferencia en los nuevos datos\n", - "\n", - "Para obtener predicciones para ejemplos nuevos, puede sencillamente llamar `model.predict()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "Incluir la lógica de preprocesamiento de textos en su modelo le permitirá exportar un modelo para producción que simplifique la implementación y reduzca la probabilidad de que se produzca un [sesgo entre entrenamiento y prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", - "\n", - "Hay una diferencia de rendimiento que tenemos que tener en cuenta a la hora de elegir dónde aplicar la capa TextVectorization. Usarla fuera de su modelo le permite hacer un procesamiento asíncrono en CPU y almacenar en búfer los datos cuando se entrena en GPU. Por lo tanto, si está entrenando su modelo en GPU, probablemente debería elegir esta opción para obtener el mejor rendimiento mientras desarrolla su modelo, y luego cambiar para incluir la capa TextVectorization dentro de su modelo cuando esté listo para prepararse para la implementación.\n", - "\n", - "Visite este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para obtener más información sobre cómo guardar modelos." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## Ejercicio: clasificación multiclase en preguntas de Stack Overflow\n", - "\n", - "En este tutorial, le mostramos cómo entrenar un clasificador binario desde cero con los conjuntos de datos de IMDB. A modo de ejercicio práctico, puede modificar este bloc de notas para entrenar un clasificador multiclase para predecir la etiqueta de una pregunta de programación en [Stack Overflow](http://stackoverflow.com/).\n", - "\n", - "Le preparamos un [conjunto de datos](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que contiene el cuerpo de varios miles de preguntas de programación, (por ejemplo, \"¿Como puedo ordenar un diccionario por valor en Python?\") que se publicaron en Stack Overflow. Cada una de ellas se etiquetó con una sola etiqueta (que puede ser Python, CSharp, JavaScript o Java). 
Su tarea consiste en tomar una pregunta como entrada y predecir la etiqueta correspondiente, en este caso, Python.\n", - "\n", - "El conjunto de datos con el que trabajará contiene miles de preguntas que fueron extraídas del conjunto de datos público de Stack Overflow en [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), que es mucho más grande y contiene más de 17 millones de publicaciones.\n", - "\n", - "Tras descargar el conjunto de datos, verá que tiene una estructura de directorio similar al conjunto de datos de IMDB con el que trabajó anteriormente:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "Nota: Para elevar el nivel de dificultad del problema de clasificación, las apariciones de las palabras Python, CSharp, JavaScript o Java en las preguntas de programación han sido reemplazadas por las palabras *en blanco* (ya que muchas preguntas mencionan el lenguaje al que se refieren).\n", - "\n", - "Para completar este ejercicio, debería modificar este bloc de notas para trabajar con el conjunto de datos de Stack Overflow aplicando los siguientes cambios:\n", - "\n", - "1. En la parte superior del bloc de notas, actualice el código que descarga el conjunto de datos de IMDB con el código para descargar el [conjunto de datos de Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que ya ha sido preparado. Como el conjunto de datos de Stack Overflow tiene una estructura de directorio similar, no será necesario que realice muchas modificaciones.\n", - "\n", - "2. Modifique la última capa de su modelo para que sea `Dense(4)`, ya que ahora son cuatro las clases de salida.\n", - "\n", - "3. Cuando compile el modelo, cambie la pérdida a `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta es la función de pérdida adecuada para usar con un problema de clasificación multiclase, cuando las etiquetas de cada clase son números enteros (en este caso, pueden ser 0, *1*, *2* o *3*). Además, cambie las métricas a `metrics=['accuracy']`, ya que este es un problema de clasificación multiclase (`tf.metrics.BinaryAccuracy` se usa solamente para clasificadores binarios).\n", - "\n", - "4. A la hora de trazar la precisión a lo largo del tiempo, cambie `binary_accuracy` y `val_binary_accuracy` por `accuracy` y `val_accuracy`, respectivamente.\n", - "\n", - "5. Una vez que haya hecho todos estos cambios, estará listo para entrenar un clasificador multiclase. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## Más información\n", - "\n", - "En este tutorial, le presentamos la clasificación de textos desde cero. 
Para obtener más información sobre el flujo de trabajo de la clasificación de textos en términos generales, consulte la [guía Clasificación de textos](https://developers.google.com/machine-learning/guides/text-classification/) de Google Developers.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# Clasificación básica de textos" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
Ver en TensorFlow.orgEjecutar en Google Colab Ver fuente en GitHub Descargar notebook
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "En este tutorial se muestra la clasificación de textos a partir de archivos de texto plano almacenados en un disco. Entrenará un clasificador binario para que analice los sentimientos de un conjunto de datos de IMDB. Al final del bloc de notas, hay un ejercicio para que lo ponga a prueba, en el que entrenará un clasificador multiclase para predecir la etiqueta de una pregunta de programación de Stack Overflow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## Análisis de sentimientos\n", + "\n", + "En este bloc de notas se entrena un modelo de análisis de sentimiento para clasificar reseñas de películas como *positivas* o *negativas* a partir del texto de la reseña. Este es un ejemplo de clasificación *binaria* (o de dos clases), un tipo de problema de aprendizaje automático importante y ampliamente aplicable.\n", + "\n", + "Usará los [enormes conjuntos de datos de reseñas de películas](https://ai.stanford.edu/~amaas/data/sentiment/) que contienen el texto de 50 000 reseñas de películas de [Internet Movie Database](https://www.imdb.com/). Se divide en 25 000 reseñas para entrenamiento y 25 000 reseñas para prueba. Los conjuntos de entrenamiento y prueba están *equilibrados*, lo que significa que contienen la misma cantidad de reseñas positivas y negativas.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### Descargar y explorar el conjunto de datos de IMDB\n", + "\n", + "Descarguemos y extraigamos los conjuntos de datos, luego, exploremos la estructura del directorio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "Los directorios `aclImdb/train/pos` y `aclImdb/train/neg` contienen muchos archivos de texto, donde cada uno corresponde a una reseña de película. Echemos un vistazo a uno de ellos." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### Cargar el conjunto de datos\n", + "\n", + "A continuación, cargará los datos del disco y los preparará en un formato adecuado para el entrenamiento. Para esto, usará la práctica utilidad [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera una estructura de directorios como la que se muestra a continuación.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "Para preparar el conjunto de datos para clasificación binaria, necesita dos carpetas en el disco, que correspondan con `class_a` y `class_b`. Estas serán las reseñas positivas y negativas de las películas, que se pueden encontrar en `aclImdb/train/pos` y `aclImdb/train/neg`. Dado que el conjunto de datos de IMDB contiene carpetas adicionales, deberá eliminarlas antes de usar esta utilidad." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "Luego, usará la utilidad `text_dataset_from_directory` para crear un `tf.data.Dataset` etiquetado. [tf.data](https://www.tensorflow.org/guide/data) es una potente colección de herramientas para trabajar con datos.\n", + "\n", + "A la hora de hacer un experimento de aprendizaje automático, lo mejor es dividir el conjunto de datos en tres partes: [entrenamiento](https://developers.google.com/machine-learning/glossary#training_set), [validación](https://developers.google.com/machine-learning/glossary#validation_set) y [prueba](https://developers.google.com/machine-learning/glossary#test-set).\n", + "\n", + "El conjunto de datos de IMDB ya está dividido en entrenamiento y prueba, pero no cuenta con un conjunto de validación. Creemos un conjunto de validación mediante una división 80:20 de los datos de entrenamiento con ayuda del argumento `validation_split` que se muestra a continuación." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "Como puede ver en el bloque de arriba, hay 25 000 ejemplos en la carpeta de entrenamiento, de lo que usará el 80 % (o 20 000) para entrenamiento. Como verá en un momento, puede entrenar un modelo pasando un conjunto de datos directamente a `model.fit`. Si es la primera vez que usa `tf.data`, también puede iterar el conjunto de datos e imprimir algunos ejemplos como se muestra a continuación." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "Tenga en cuenta que las reseñas contienen texto bruto (con puntuación y algunas etiquetas HTML como `
`). En la siguiente sección le mostraremos cómo debe manejar esto.\n", + "\n", + "Las etiquetas son 0 o 1. Para ver cuál corresponde a las reseñas positivas y negativas de las películas, puede consultar la propiedad `class_names` en el conjunto de datos.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "A continuación, creará un conjunto de datos de validación y prueba. Usará las 5000 reseñas restantes del conjunto de entrenamiento para ejecutar la validación." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "Nota: Cuando use los argumentos `validation_split` y `subset`, asegúrese de especificar una semilla o de pasar `shuffle=False`, para que las fracciones de validación y entrenamiento no se superpongan." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### Preparar el conjunto de datos para entrenamiento\n", + "\n", + "A continuación, usará la útil capa `tf.keras.layers.TextVectorization` para estndarizar, tokenizar y vectorizar los datos.\n", + "\n", + "El término estandarización se refiere al preprocesamiento del texto que generalmente se utiliza para eliminar la puntuación o los elementos de HTML con el objetivo de simplificar el conjunto de datos. Tokenizar en este contexto es dividir las cadenas en tokens (por ejemplo, separar una frase en palabras individuales, usando los espacios en blanco para separar). La vetorización se refiere al proceso mediante el cual los tokens se convierten en números que se pueden cargar a la red neuronal. Todas estas tareas se pueden completar con esta capa.\n", + "\n", + "Como pudo ver anteriormente, las reseñas contienen varias etiquetas HTML como `
`. El estandarizador predeterminado de la capa `TextVectorization` (que convierte texto a minúsculas y elimina la puntuación de forma predeterminada, pero no elimina los elementos de HTML) no eliminará estas etiquetas. Deberá escribir una función de estandarización personalizada para eliminar el HTML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "Nota: Para evitar el [sesgo entrenamiento-prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (también conocido como sesgo entrenamiento-servicio), es importante preprocesar los datos de forma idéntica tanto durante el entrenamiento como en la etapa de prueba. Para simplificar esto, la capa `TextVectorization` se puede incluir directamente dentro del modelo, como se muestra más adelante en este tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "Luego, creará una capa `TextVectorization`. Usará esta capa para estandarizar, tokenizar y vectorizar nuestros datos. Configurará `output_mode` en `int` para crear índices enteros únicos para cada token.\n", + "\n", + "Tenga en cuenta que está usando la función de separación predeterminada y la función de estandarización personalizada que definió anteriormente. También deberá definir algunas constantes para el modelo, como un valor máximo explícito de `sequence_length`, que hará que cada capa amortigüe o trunque las secuencias exactamente a los valores `sequence_length`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "A continuación, llamará `adapt` para que ajuste el estado de la capa de preprocesamiento al conjunto de datos. Esto hará que el modelo convierta un índice de cadenas a enteros." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "Nota: Es importante que solo use sus datos de entrenamiento para al llamar adapt (si usa el conjunto de prueba, se podría filtrar información)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "Creemos una función para ver los resultados del uso de esta capa para preprocesar algunos datos." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "Como pudo ver anteriormente, cada token ha sido reemplazo por un entero. Puede buscar el token (cadena) al que corresponde cada entero llamando `.get_vocabulary()` en la capa." 
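[Editorial note] The text above explains that `.get_vocabulary()` maps integer indices back to tokens, and the next cell looks up two individual indices. As a small hypothetical helper (not part of the notebook), the whole vocabulary can be inverted to decode a vectorized review back into its standardized tokens for inspection; this assumes `vectorize_layer`, `vectorize_text`, `first_review`, and `first_label` from the cells above:

```
# Hypothetical inspection helper: map integer indices back to tokens.
vocab = vectorize_layer.get_vocabulary()   # index 0 is padding, index 1 is '[UNK]'

def decode_review(vectorized):
    # Drop the padding zeros and join the remaining tokens.
    return " ".join(vocab[int(i)] for i in vectorized.numpy().flatten() if i != 0)

first_vectorized, _ = vectorize_text(first_review, first_label)
print(decode_review(first_vectorized))
```

The decoded text is the standardized form (lowercased, with HTML and punctuation stripped), so it will not match the raw review character for character.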
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "Ya está casi listo para entrenar su modelo. Como último paso de preprocesamiento, debe aplicar la capa TextVectorization que creó anteriormente a los conjuntos de datos de entrenamiento, validación y prueba." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### Configurar el conjunto de datos para mejorar el rendimiento\n", + "\n", + "Hay dos métodos importantes que debería usar al cargar los datos para asegurarse de que la E/S no se bloquee.\n", + "\n", + "`.cache()` conserva los datos en la memoria después de que descarga del disco. Esto evitará que el conjunto de datos se transforme en un cuello de botella mientras entrena su modelo. Si su conjunto de datos es demasiado grande para caber en la memoria, también puede usar este método para crear un potente caché en disco, que se lee de forma más eficiente que muchos archivos pequeños.\n", + "\n", + "`.prefetch()` superpone el preprocesamiento de los datos y la ejecución del modelo durante el entrenamiento.\n", + "\n", + "Puede obtener más información sobre ambos métodos y sobre cómo almacenar datos en caché en disco en la [guía de rendimiento de datos](https://www.tensorflow.org/guide/data_performance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### Crear el modelo\n", + "\n", + "Llegó la hora de que cree su red neuronal:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "Las capas se apilan secuencialmente para generar el clasificador:\n", + "\n", + "1. La primera capa es una capa `Embedding`. Esta capa toma las reseñas cifradas con números enteros y busca un vector de incorporación para cada índice de palabra. Estos vectores se aprenden a medida que se entrena el modelo. Los vectores agregan una dimensión al arreglo de salida. 
Las dimensiones resultantes son las siguientes: `(batch, sequence, embedding)`. Para obtener más información sobre las incorporaciones, consulte el tutorial [Incorporaciones de palabras](https://www.tensorflow.org/text/guide/word_embeddings).\n", + "2. A continuación, una capa `GlobalAveragePooling1D` devuelve un vector de salida de longitud fija para cada ejemplo calculando el promedio sobre la dimensión de la secuencia. Esto le permite a modelo manejar entradas de longitud variable, de la forma más sencilla posible.\n", + "3. La última capa está densamente conectada con un único nodo de salida." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### Función de pérdida y optimizador\n", + "\n", + "Un modelo necesita una función de pérdida y un optimizador para el entrenamiento. Dado que este es un problema de clasificación binaria y el modelo genera una probabilidad (una capa de una sola unidad con una activación sigmoide), usaremos la función de pérdida `losses.BinaryCrossentropy`.\n", + "\n", + "Ahora, configure el modelo para usar un optimizador y una función de pérdida:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### Entrenar el modelo\n", + "\n", + "Entrenará el modelo pasando el objeto `dataset` al método fit." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### Evaluar el modelo\n", + "\n", + "Veamos el rendimiento del modelo. Nos devolverá dos valores; la pérdida (un número que representa nuestro error, los valores bajos son mejores) y la precisión." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "Este enfoque bastante sencillo alcanza una precisión del 86 %." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### Cree un gráfico de precisión y pérdida a lo largo del tiempo\n", + "\n", + "`model.fit()` devuelve un objeto `History` que contiene un diccionario con todo lo que pasó durante el entrenamiento:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "Hay cuatro entradas: una por cada métrica que se monitoreó durante el entrenamiento y la validación. 
Puede usarlas para trazar la pérdida de entrenamiento y validación para compararlas, puede hacer lo mismo con la precisión:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "En este gráfico, los puntos representan la pérdida y la precisión del entrenamiento y las líneas continuas reflejan la pérdida y la precisión de la validación.\n", + "\n", + "Como puede ver, la pérdida del entrenamiento *se reduce* época tras época y la precisión del entrenamiento *aumenta* a medida que pasan las épocas. Esto es lo que suele pasar cuando se usa una optimización con descenso de gradiente, debe reducir al mínimo la cantidad deseada en cada iteración.\n", + "\n", + "Esto no es lo que sucede en el caso de la pérdida y la precisión de la validación, al parecer llegan a su punto máximo antes que la precisión del entrenamiento. Este es un ejemplo de sobreajuste: el modelo funciona mejor con los datos de entrenamiento que con los datos que no ha visto anteriormente. Pasado este punto, el modelo se sobreoptimiza y aprende representaciones *específicas* de los datos de entrenamiento que no se *generalizan* a los datos de prueba.\n", + "\n", + "En este caso particular, podría evitar el sobreajuste con tan solo detener el entrenamiento cuando la precisión de validación deje de aumentar. Una forma de hacerlo es con la retrollamada `tf.keras.callbacks.EarlyStopping`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## Exportar el modelo\n", + "\n", + "En el código que vimos arriba, se aplicó la capa `TextVectorization` al conjunto de datos antes de cargar texto al modelo. Si desea que su modelo sea capaz de procesar cadenas sin procesar (por ejemplo, para simplificar la implementación), puede incluir la capa `TextVectorization` en su modelo. Para ello, puede crear un nuevo modelo a partir de los pesos que acaba de entrenar." 
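As a complement to the overfitting discussion above, and before moving on to the export code below, here is a minimal sketch of the `tf.keras.callbacks.EarlyStopping` callback it mentions; this is not part of the notebook's code, and the `patience` and `restore_best_weights` settings are illustrative assumptions. The monitored metric name follows from the `BinaryAccuracy` metric configured when the model was compiled.

```python
import tensorflow as tf

# Hedged sketch of the EarlyStopping callback referenced above; the patience
# value and restore_best_weights choice are assumptions, not tutorial settings.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',  # the validation metric tracked in this tutorial
    mode='max',                     # higher accuracy is better, so stop on a plateau
    patience=2,                     # allow 2 epochs without improvement before stopping
    restore_best_weights=True)      # roll back to the weights of the best epoch

# It would be passed to the earlier training call, for example:
# history = model.fit(train_ds, validation_data=val_ds, epochs=epochs,
#                     callbacks=[early_stopping])
```

If such a callback were added to the `fit()` call, training would halt once validation accuracy stops improving instead of always running the full 10 epochs, which is one way to avoid the overfitting seen in the plots.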
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### Inferencia en los nuevos datos\n", + "\n", + "Para obtener predicciones para ejemplos nuevos, puede sencillamente llamar `model.predict()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "Incluir la lógica de preprocesamiento de textos en su modelo le permitirá exportar un modelo para producción que simplifique la implementación y reduzca la probabilidad de que se produzca un [sesgo entre entrenamiento y prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", + "\n", + "Hay una diferencia de rendimiento que tenemos que tener en cuenta a la hora de elegir dónde aplicar la capa TextVectorization. Usarla fuera de su modelo le permite hacer un procesamiento asíncrono en CPU y almacenar en búfer los datos cuando se entrena en GPU. Por lo tanto, si está entrenando su modelo en GPU, probablemente debería elegir esta opción para obtener el mejor rendimiento mientras desarrolla su modelo, y luego cambiar para incluir la capa TextVectorization dentro de su modelo cuando esté listo para prepararse para la implementación.\n", + "\n", + "Visite este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para obtener más información sobre cómo guardar modelos." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## Ejercicio: clasificación multiclase en preguntas de Stack Overflow\n", + "\n", + "En este tutorial, le mostramos cómo entrenar un clasificador binario desde cero con los conjuntos de datos de IMDB. A modo de ejercicio práctico, puede modificar este bloc de notas para entrenar un clasificador multiclase para predecir la etiqueta de una pregunta de programación en [Stack Overflow](http://stackoverflow.com/).\n", + "\n", + "Le preparamos un [conjunto de datos](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que contiene el cuerpo de varios miles de preguntas de programación, (por ejemplo, \"¿Como puedo ordenar un diccionario por valor en Python?\") que se publicaron en Stack Overflow. Cada una de ellas se etiquetó con una sola etiqueta (que puede ser Python, CSharp, JavaScript o Java). 
Su tarea consiste en tomar una pregunta como entrada y predecir la etiqueta correspondiente, en este caso, Python.\n", + "\n", + "El conjunto de datos con el que trabajará contiene miles de preguntas que fueron extraídas del conjunto de datos público de Stack Overflow en [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), que es mucho más grande y contiene más de 17 millones de publicaciones.\n", + "\n", + "Tras descargar el conjunto de datos, verá que tiene una estructura de directorio similar al conjunto de datos de IMDB con el que trabajó anteriormente:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "Nota: Para elevar el nivel de dificultad del problema de clasificación, las apariciones de las palabras Python, CSharp, JavaScript o Java en las preguntas de programación han sido reemplazadas por las palabras *en blanco* (ya que muchas preguntas mencionan el lenguaje al que se refieren).\n", + "\n", + "Para completar este ejercicio, debería modificar este bloc de notas para trabajar con el conjunto de datos de Stack Overflow aplicando los siguientes cambios:\n", + "\n", + "1. En la parte superior del bloc de notas, actualice el código que descarga el conjunto de datos de IMDB con el código para descargar el [conjunto de datos de Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que ya ha sido preparado. Como el conjunto de datos de Stack Overflow tiene una estructura de directorio similar, no será necesario que realice muchas modificaciones.\n", + "\n", + "2. Modifique la última capa de su modelo para que sea `Dense(4)`, ya que ahora son cuatro las clases de salida.\n", + "\n", + "3. Cuando compile el modelo, cambie la pérdida a `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta es la función de pérdida adecuada para usar con un problema de clasificación multiclase, cuando las etiquetas de cada clase son números enteros (en este caso, pueden ser 0, *1*, *2* o *3*). Además, cambie las métricas a `metrics=['accuracy']`, ya que este es un problema de clasificación multiclase (`tf.metrics.BinaryAccuracy` se usa solamente para clasificadores binarios).\n", + "\n", + "4. A la hora de trazar la precisión a lo largo del tiempo, cambie `binary_accuracy` y `val_binary_accuracy` por `accuracy` y `val_accuracy`, respectivamente.\n", + "\n", + "5. Una vez que haya hecho todos estos cambios, estará listo para entrenar un clasificador multiclase. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## Más información\n", + "\n", + "En este tutorial, le presentamos la clasificación de textos desde cero. 
Para obtener más información sobre el flujo de trabajo de la clasificación de textos en términos generales, consulte la [guía Clasificación de textos](https://developers.google.com/machine-learning/guides/text-classification/) de Google Developers.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/ja/tutorials/keras/text_classification.ipynb b/site/ja/tutorials/keras/text_classification.ipynb index 4448bf8d3d..776ad320be 100644 --- a/site/ja/tutorials/keras/text_classification.ipynb +++ b/site/ja/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# 映画レビューのテキスト分類" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
TensorFlow.org で表示 Google Colab で実行 GitHub でソースを表示 ノートブックをダウンロード
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "このチュートリアルでは、ディスクに保存されているプレーンテキストファイルを使用してテキストを分類する方法について説明します。IMDB データセットでセンチメント分析を実行するように、二項分類器をトレーニングします。ノートブックの最後には、Stack Overflow のプログラミングに関する質問のタグを予測するためのマルチクラス分類器をトレーニングする演習があります。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## センチメント分析\n", - "\n", - "このノートブックでは、映画レビューのテキストを使用して、それが*肯定的*であるか*否定的*であるかに分類するようにセンチメント分析モデルをトレーニングします。これは*二項*分類の例で、機械学習問題では重要な分類法として広く適用されます。\n", - "\n", - "ここでは、[Internet Movie Database](https://ai.stanford.edu/~amaas/data/sentiment/) から抽出した 50,000 件の映画レビューを含む、[大規模なレビューデータセット](https://www.imdb.com/)を使います。レビューはトレーニング用とテスト用に 25,000 件ずつに分割されています。トレーニング用とテスト用のデータは均衡しています。言い換えると、それぞれが同数の肯定的及び否定的なレビューを含んでいます。\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### IMDB データセットをダウンロードして調べる\n", - "\n", - "データセットをダウンロードして抽出してから、ディレクトリ構造を調べてみましょう。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "`aclImdb/train/pos` および `aclImdb/train/neg` ディレクトリには多くのテキストファイルが含まれており、それぞれが 1 つの映画レビューです。それらの 1 つを見てみましょう。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### データセットを読み込む\n", - "\n", - "次に、データをディスクから読み込み、トレーニングに適した形式に準備します。これを行うには、便利な [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) ユーティリティを使用します。このユーティリティは、次のようなディレクトリ構造を想定しています。\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "二項分類用のデータセットを準備するには、ディスクに `class_a` および `class_b`に対応する 2 
つのフォルダが必要です。これらは、`aclImdb/train/pos` および `aclImdb/train/neg` にある肯定的および否定的な映画レビューになります。IMDB データセットには追加のフォルダーが含まれているため、このユーティリティを使用する前にそれらを削除します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "次に、`text_dataset_from_directory` ユーティリティを使用して、ラベル付きの `tf.data.Dataset` を作成します。[tf.data](https://www.tensorflow.org/guide/data) は、データを操作するための強力なツールのコレクションです。\n", - "\n", - "機械学習実験を実行するときは、データセットを[トレーニング](https://developers.google.com/machine-learning/glossary#training_set)、[検証](https://developers.google.com/machine-learning/glossary#validation_set)、および、[テスト](https://developers.google.com/machine-learning/glossary#test-set)の 3 つに分割することをお勧めします。\n", - "\n", - "IMDB データセットはすでにトレーニング用とテスト用に分割されていますが、検証セットはありません。以下の `validation_split` 引数を使用して、トレーニングデータの 80:20 分割を使用して検証セットを作成しましょう。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "上記のように、トレーニングフォルダには 25,000 の例があり、そのうち 80% (20,000) をトレーニングに使用します。以下に示すとおり、データセットを `model.fit` に直接渡すことで、モデルをトレーニングできます。`tf.data` を初めて使用する場合は、データセットを繰り返し処理して、次のようにいくつかの例を出力することもできます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "レビューには生のテキストが含まれていることに注意してください(句読点や `
` などのような HTML タグが付いていることもあります)。次のセクションでは、これらの処理方法を示します。\n", - "\n", - "ラベルは 0 または 1 です。これらのどれが肯定的および否定的な映画レビューに対応するかを確認するには、データセットの `class_names` プロパティを確認できます。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "次に、検証およびテスト用データセットを作成します。トレーニング用セットの残りの 5,000 件のレビューを検証に使用します。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "注意: `validation_split` および `subset` 引数を使用する場合は、必ずランダムシードを指定するか、`shuffle=False` を渡して、検証とトレーニング分割に重複がないようにします。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### トレーニング用データセットを準備する\n", - "\n", - "次に、便利な `tf.keras.layers.TextVectorization` レイヤーを使用して、データを標準化、トークン化、およびベクトル化します。\n", - "\n", - "標準化とは、テキストを前処理することを指します。通常、句読点や HTML 要素を削除して、データセットを簡素化します。トークン化とは、文字列をトークンに分割することです (たとえば、空白で分割することにより、文を個々の単語に分割します)。ベクトル化とは、トークンを数値に変換して、ニューラルネットワークに入力できるようにすることです。これらのタスクはすべて、このレイヤーで実行できます。\n", - "\n", - "前述のとおり、レビューには `
` のようなさまざまな HTML タグが含まれています。これらのタグは、`TextVectorization` レイヤーのデフォルトの標準化機能によって削除されません (テキストを小文字に変換し、デフォルトで句読点を削除しますが、HTML は削除されません)。HTML を削除するカスタム標準化関数を作成します。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "注意: [トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(トレーニング/サービングスキューとも呼ばれます)を防ぐには、トレーニング時とテスト時にデータを同じように前処理することが重要です。これを容易にするためには、このチュートリアルの後半で示すように、`TextVectorization` レイヤーをモデル内に直接含めます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "次に、`TextVectorization` レイヤーを作成します。このレイヤーを使用して、データを標準化、トークン化、およびベクトル化します。`output_mode` を `int` に設定して、トークンごとに一意の整数インデックスを作成します。\n", - "\n", - "デフォルトの分割関数と、上記で定義したカスタム標準化関数を使用していることに注意してください。また、明示的な最大値 `sequence_length` など、モデルの定数をいくつか定義します。これにより、レイヤーはシーケンスを正確に `sequence_length` 値にパディングまたは切り捨てます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "次に、`adapt` を呼び出して、前処理レイヤーの状態をデータセットに適合させます。これにより、モデルは文字列から整数へのインデックスを作成します。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "注意: Adapt を呼び出すときは、トレーニング用データのみを使用することが重要です(テスト用セットを使用すると情報が漏洩します)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "このレイヤーを使用して一部のデータを前処理した結果を確認する関数を作成します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "上記のように、各トークンは整数に置き換えられています。レイヤーで `.get_vocabulary()` を呼び出すことにより、各整数が対応するトークン(文字列)を検索できます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "モデルをトレーニングする準備がほぼ整いました。最後の前処理ステップとして、トレーニング、検証、およびデータセットのテストのために前に作成した TextVectorization レイヤーを適用します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### データセットを構成してパフォーマンスを改善する\n", - "\n", - "以下は、I/O がブロックされないようにするためにデータを読み込むときに使用する必要がある 2 つの重要な方法です。\n", - "\n", - "`.cache()` はデータをディスクから読み込んだ後、データをメモリに保持します。これにより、モデルのトレーニング中にデータセットがボトルネックになることを回避できます。データセットが大きすぎてメモリに収まらない場合は、この方法を使用して、パフォーマンスの高いオンディスクキャッシュを作成することもできます。これは、多くの小さなファイルを読み込むより効率的です。\n", - "\n", - "`.prefetch()` はトレーニング中にデータの前処理とモデルの実行をオーバーラップさせます。\n", - "\n", - "以上の 2 つの方法とデータをディスクにキャッシュする方法についての詳細は、[データパフォーマンスガイド](https://www.tensorflow.org/guide/data_performance)を参照してください。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### モデルを作成する\n", - "\n", - "ニューラルネットワークを作成します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "これらのレイヤーは、分類器を構成するため一列に積み重ねられます。\n", - "\n", - "1. 最初のレイヤーは `Embedding` (埋め込み)レイヤーです。このレイヤーは、整数にエンコードされた語彙を受け取り、それぞれの単語インデックスに対応する埋め込みベクトルを検索します。埋め込みベクトルは、モデルのトレーニングの中で学習されます。ベクトル化のために、出力行列には次元が1つ追加されます。その結果、次元は、`(batch, sequence, embedding)` となります。埋め込みの詳細については、[単語埋め込みチュートリアル](https://www.tensorflow.org/text/guide/word_embeddings)を参照してください。\n", - "2. 次は、`GlobalAveragePooling1D`(1次元のグローバル平均プーリング)レイヤーです。このレイヤーは、それぞれのサンプルについて、シーケンスの次元方向に平均値をもとめ、固定長のベクトルを返します。この結果、モデルは最も単純な形で、可変長の入力を扱うことができるようになります。\n", - "3. 
最後のレイヤーは、単一の出力ノードと密に接続されています。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### 損失関数とオプティマイザ\n", - "\n", - "モデルをトレーニングするには、損失関数とオプティマイザが必要です。これは二項分類問題であり、モデルは確率(シグモイドアクティベーションを持つ単一ユニットレイヤー)を出力するため、`losses.BinaryCrossentropy` 損失関数を使用します。\n", - "\n", - "損失関数の候補はこれだけではありません。例えば、`mean_squared_error`(平均二乗誤差)を使うこともできます。しかし、一般的には、確率を扱うには`binary_crossentropy`の方が適しています。`binary_crossentropy`は、確率分布の間の「距離」を測定する尺度です。今回の場合には、真の分布と予測値の分布の間の距離ということになります。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### モデルをトレーニングする\n", - "\n", - "`dataset` オブジェクトを fit メソッドに渡すことにより、モデルをトレーニングします。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### モデルを評価する\n", - "\n", - "モデルがどのように実行するか見てみましょう。2 つの値が返されます。損失(誤差、値が低いほど良)と正確度です。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "この、かなり素朴なアプローチでも 86% 前後の正解度を達成しました。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### 経時的な正解度と損失のグラフを作成する\n", - "\n", - "`model.fit()` は、トレーニング中に発生したすべての情報を詰まったディクショナリを含む `History` オブジェクトを返します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "トレーニングと検証中に監視されている各メトリックに対して 1 つずつ、計 4 つのエントリがあります。このエントリを使用して、トレーニングと検証の損失とトレーニングと検証の正解度を比較したグラフを作成することができます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower 
right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "このグラフでは、点はトレーニングの損失と正解度を表し、実線は検証の損失と正解度を表します。\n", - "\n", - "トレーニングの損失がエポックごとに*下降*し、トレーニングの正解度がエポックごとに*上昇*していることに注目してください。これは、勾配下降最適化を使用しているときに見られる現象で、イテレーションごとに希望する量を最小化します。\n", - "\n", - "これは検証の損失と精度には当てはまりません。これらはトレーニング精度の前にピークに達しているようです。これが過適合の例で、モデルが、遭遇したことのないデータよりもトレーニングデータで優れたパフォーマンスを発揮する現象です。この後、モデルは過度に最適化し、テストデータに*一般化*しないトレーニングデータ*特有*の表現を学習します。\n", - "\n", - "この特定のケースでは、検証の正解度が向上しなくなったときにトレーニングを停止することにより、過適合を防ぐことができます。これを行うには、`tf.keras.callbacks.EarlyStopping` コールバックを使用することができます。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## モデルをエクスポートする\n", - "\n", - "上記のコードでは、モデルにテキストをフィードする前に、`TextVectorization` レイヤーをデータセットに適用しました。モデルで生の文字列を処理できるようにする場合 (たとえば、展開を簡素化するため)、モデル内に `TextVectorization` レイヤーを含めることができます。これを行うには、トレーニングしたばかりの重みを使用して新しいモデルを作成します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### 新しいデータの推論\n", - "\n", - "新しい例の予測を取得するには、`model.predict()`を呼び出します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "モデル内にテキスト前処理ロジックを含めると、モデルを本番環境にエクスポートして展開を簡素化し、[トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)の可能性を減らすことができます。\n", - "\n", - "TextVectorization レイヤーを適用する場所を選択する際に性能の違いに留意する必要があります。モデルの外部で使用すると、GPU でトレーニングするときに非同期 CPU 処理とデータのバッファリングを行うことができます。したがって、GPU でモデルをトレーニングしている場合は、モデルの開発中に最高のパフォーマンスを得るためにこのオプションを使用し、デプロイの準備ができたらモデル内に TextVectorization レイヤーを含めるように切り替えることをお勧めします。\n", - "\n", - "モデルの保存の詳細については、この[チュートリアル](https://www.tensorflow.org/tutorials/keras/save_and_load)にアクセスしてください。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## 演習:StackOverflow の質問に対するマルチクラス分類\n", - "\n", - "このチュートリアルでは、IMDB データセットで二項分類器を最初からトレーニングする方法を示しました。演習として、このノートブックを変更して、[Stack Overflow](http://stackoverflow.com/) のプログラミング質問のタグを予測するマルチクラス分類器をトレーニングできます。\n", - "\n", - "Stack Overflow に投稿された数千のプログラミングに関する質問(たとえば、「Python でディクショナリを値で並べ替える方法」)の本文を含む[データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)が用意されています。それぞれ、1 つのタグ(Python、CSharp、JavaScript、または Java のいずれか)でラベル付けされています。この演習では、質問を入力として受け取り、適切なタグ(この場合は Python)を予測します。\n", - "\n", - "使用するデータセットには、1,700 万件以上の投稿を含む [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) の大規模な StackOverflow パブリックデータセットから抽出された数千の質問が含まれています。\n", - "\n", - "データセットをダウンロードすると、以前に使用した IMDB データセットと同様のディレクトリ構造になっていることがわかります。\n", - "\n", - 
"```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "注意: 分類問題の難易度を上げるために、プログラミングの質問での Python、CSharp、JavaScript、または Java という単語は、*blank* という単語に置き換えられました(多くの質問には、対象の言語が含まれているため)。\n", - "\n", - "この演習を完了するには、、このノートブックを変更してStackOverflow データセットを操作する必要があります。次の変更を行います。\n", - "\n", - "1. ノートブックの上部で、IMDB データセットをダウンロードするコードを、事前に準備されている [Stack Overflow データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)をダウンロードするコードで更新します。Stack Overflow データセットは同様のディレクトリ構造を持っているため、多くの変更を加える必要はありません。\n", - "\n", - "2. 4 つの出力クラスがあるため、モデルの最後のレイヤーを `Dense(4)` に変更します。\n", - "\n", - "3. モデルをコンパイルするときは、損失を `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)` に変更します。これは、各クラスのラベルが整数である場合に、マルチクラス分類問題に使用する正しい損失関数です。(この場合、 0、*1*、*2*、または 3 のいずれかになります)。さらに、これはマルチクラス分類の問題であるため、メトリックを `metrics=['accuracy']` に変更します (tf.metrics.BinaryAccuracy はバイナリ分類器にのみ使用されます)。\n", - "\n", - "4. 経時的な精度をプロットする場合は、`binary_accuracy` および `val_binary_accuracy`をそれぞれ `accuracy` および `val_accuracy` に変更します。\n", - "\n", - "5. これらの変更が完了すると、マルチクラス分類器をトレーニングできるようになります。 " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## 詳細\n", - "\n", - "このチュートリアルでは、最初からテキスト分類を実行する方法を紹介しました。一般的なテキスト分類ワークフローの詳細については、Google Developers の[テキスト分類ガイド](https://developers.google.com/machine-learning/guides/text-classification/)をご覧ください。\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# 映画レビューのテキスト分類" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
TensorFlow.org で表示 Google Colab で実行 GitHub でソースを表示 ノートブックをダウンロード
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "このチュートリアルでは、ディスクに保存されているプレーンテキストファイルを使用してテキストを分類する方法について説明します。IMDB データセットでセンチメント分析を実行するように、二項分類器をトレーニングします。ノートブックの最後には、Stack Overflow のプログラミングに関する質問のタグを予測するためのマルチクラス分類器をトレーニングする演習があります。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## センチメント分析\n", + "\n", + "このノートブックでは、映画レビューのテキストを使用して、それが*肯定的*であるか*否定的*であるかに分類するようにセンチメント分析モデルをトレーニングします。これは*二項*分類の例で、機械学習問題では重要な分類法として広く適用されます。\n", + "\n", + "ここでは、[Internet Movie Database](https://ai.stanford.edu/~amaas/data/sentiment/) から抽出した 50,000 件の映画レビューを含む、[大規模なレビューデータセット](https://www.imdb.com/)を使います。レビューはトレーニング用とテスト用に 25,000 件ずつに分割されています。トレーニング用とテスト用のデータは均衡しています。言い換えると、それぞれが同数の肯定的及び否定的なレビューを含んでいます。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### IMDB データセットをダウンロードして調べる\n", + "\n", + "データセットをダウンロードして抽出してから、ディレクトリ構造を調べてみましょう。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "`aclImdb/train/pos` および `aclImdb/train/neg` ディレクトリには多くのテキストファイルが含まれており、それぞれが 1 つの映画レビューです。それらの 1 つを見てみましょう。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### データセットを読み込む\n", + "\n", + "次に、データをディスクから読み込み、トレーニングに適した形式に準備します。これを行うには、便利な [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) ユーティリティを使用します。このユーティリティは、次のようなディレクトリ構造を想定しています。\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "二項分類用のデータセットを準備するには、ディスクに `class_a` および `class_b`に対応する 2 
つのフォルダが必要です。これらは、`aclImdb/train/pos` および `aclImdb/train/neg` にある肯定的および否定的な映画レビューになります。IMDB データセットには追加のフォルダーが含まれているため、このユーティリティを使用する前にそれらを削除します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "次に、`text_dataset_from_directory` ユーティリティを使用して、ラベル付きの `tf.data.Dataset` を作成します。[tf.data](https://www.tensorflow.org/guide/data) は、データを操作するための強力なツールのコレクションです。\n", + "\n", + "機械学習実験を実行するときは、データセットを[トレーニング](https://developers.google.com/machine-learning/glossary#training_set)、[検証](https://developers.google.com/machine-learning/glossary#validation_set)、および、[テスト](https://developers.google.com/machine-learning/glossary#test-set)の 3 つに分割することをお勧めします。\n", + "\n", + "IMDB データセットはすでにトレーニング用とテスト用に分割されていますが、検証セットはありません。以下の `validation_split` 引数を使用して、トレーニングデータの 80:20 分割を使用して検証セットを作成しましょう。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "上記のように、トレーニングフォルダには 25,000 の例があり、そのうち 80% (20,000) をトレーニングに使用します。以下に示すとおり、データセットを `model.fit` に直接渡すことで、モデルをトレーニングできます。`tf.data` を初めて使用する場合は、データセットを繰り返し処理して、次のようにいくつかの例を出力することもできます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "レビューには生のテキストが含まれていることに注意してください(句読点や `
` などのような HTML タグが付いていることもあります)。次のセクションでは、これらの処理方法を示します。\n", + "\n", + "ラベルは 0 または 1 です。これらのどれが肯定的および否定的な映画レビューに対応するかを確認するには、データセットの `class_names` プロパティを確認できます。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "次に、検証およびテスト用データセットを作成します。トレーニング用セットの残りの 5,000 件のレビューを検証に使用します。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "注意: `validation_split` および `subset` 引数を使用する場合は、必ずランダムシードを指定するか、`shuffle=False` を渡して、検証とトレーニング分割に重複がないようにします。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### トレーニング用データセットを準備する\n", + "\n", + "次に、便利な `tf.keras.layers.TextVectorization` レイヤーを使用して、データを標準化、トークン化、およびベクトル化します。\n", + "\n", + "標準化とは、テキストを前処理することを指します。通常、句読点や HTML 要素を削除して、データセットを簡素化します。トークン化とは、文字列をトークンに分割することです (たとえば、空白で分割することにより、文を個々の単語に分割します)。ベクトル化とは、トークンを数値に変換して、ニューラルネットワークに入力できるようにすることです。これらのタスクはすべて、このレイヤーで実行できます。\n", + "\n", + "前述のとおり、レビューには `
` のようなさまざまな HTML タグが含まれています。これらのタグは、`TextVectorization` レイヤーのデフォルトの標準化機能によって削除されません (テキストを小文字に変換し、デフォルトで句読点を削除しますが、HTML は削除されません)。HTML を削除するカスタム標準化関数を作成します。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "注意: [トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(トレーニング/サービングスキューとも呼ばれます)を防ぐには、トレーニング時とテスト時にデータを同じように前処理することが重要です。これを容易にするためには、このチュートリアルの後半で示すように、`TextVectorization` レイヤーをモデル内に直接含めます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "次に、`TextVectorization` レイヤーを作成します。このレイヤーを使用して、データを標準化、トークン化、およびベクトル化します。`output_mode` を `int` に設定して、トークンごとに一意の整数インデックスを作成します。\n", + "\n", + "デフォルトの分割関数と、上記で定義したカスタム標準化関数を使用していることに注意してください。また、明示的な最大値 `sequence_length` など、モデルの定数をいくつか定義します。これにより、レイヤーはシーケンスを正確に `sequence_length` 値にパディングまたは切り捨てます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "次に、`adapt` を呼び出して、前処理レイヤーの状態をデータセットに適合させます。これにより、モデルは文字列から整数へのインデックスを作成します。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "注意: Adapt を呼び出すときは、トレーニング用データのみを使用することが重要です(テスト用セットを使用すると情報が漏洩します)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "このレイヤーを使用して一部のデータを前処理した結果を確認する関数を作成します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "上記のように、各トークンは整数に置き換えられています。レイヤーで `.get_vocabulary()` を呼び出すことにより、各整数が対応するトークン(文字列)を検索できます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "モデルをトレーニングする準備がほぼ整いました。最後の前処理ステップとして、トレーニング、検証、およびデータセットのテストのために前に作成した TextVectorization レイヤーを適用します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### データセットを構成してパフォーマンスを改善する\n", + "\n", + "以下は、I/O がブロックされないようにするためにデータを読み込むときに使用する必要がある 2 つの重要な方法です。\n", + "\n", + "`.cache()` はデータをディスクから読み込んだ後、データをメモリに保持します。これにより、モデルのトレーニング中にデータセットがボトルネックになることを回避できます。データセットが大きすぎてメモリに収まらない場合は、この方法を使用して、パフォーマンスの高いオンディスクキャッシュを作成することもできます。これは、多くの小さなファイルを読み込むより効率的です。\n", + "\n", + "`.prefetch()` はトレーニング中にデータの前処理とモデルの実行をオーバーラップさせます。\n", + "\n", + "以上の 2 つの方法とデータをディスクにキャッシュする方法についての詳細は、[データパフォーマンスガイド](https://www.tensorflow.org/guide/data_performance)を参照してください。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### モデルを作成する\n", + "\n", + "ニューラルネットワークを作成します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "これらのレイヤーは、分類器を構成するため一列に積み重ねられます。\n", + "\n", + "1. 最初のレイヤーは `Embedding` (埋め込み)レイヤーです。このレイヤーは、整数にエンコードされた語彙を受け取り、それぞれの単語インデックスに対応する埋め込みベクトルを検索します。埋め込みベクトルは、モデルのトレーニングの中で学習されます。ベクトル化のために、出力行列には次元が1つ追加されます。その結果、次元は、`(batch, sequence, embedding)` となります。埋め込みの詳細については、[単語埋め込みチュートリアル](https://www.tensorflow.org/text/guide/word_embeddings)を参照してください。\n", + "2. 次は、`GlobalAveragePooling1D`(1次元のグローバル平均プーリング)レイヤーです。このレイヤーは、それぞれのサンプルについて、シーケンスの次元方向に平均値をもとめ、固定長のベクトルを返します。この結果、モデルは最も単純な形で、可変長の入力を扱うことができるようになります。\n", + "3. 
最後のレイヤーは、単一の出力ノードと密に接続されています。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### 損失関数とオプティマイザ\n", + "\n", + "モデルをトレーニングするには、損失関数とオプティマイザが必要です。これは二項分類問題であり、モデルは確率(シグモイドアクティベーションを持つ単一ユニットレイヤー)を出力するため、`losses.BinaryCrossentropy` 損失関数を使用します。\n", + "\n", + "損失関数の候補はこれだけではありません。例えば、`mean_squared_error`(平均二乗誤差)を使うこともできます。しかし、一般的には、確率を扱うには`binary_crossentropy`の方が適しています。`binary_crossentropy`は、確率分布の間の「距離」を測定する尺度です。今回の場合には、真の分布と予測値の分布の間の距離ということになります。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### モデルをトレーニングする\n", + "\n", + "`dataset` オブジェクトを fit メソッドに渡すことにより、モデルをトレーニングします。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### モデルを評価する\n", + "\n", + "モデルがどのように実行するか見てみましょう。2 つの値が返されます。損失(誤差、値が低いほど良)と正確度です。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "この、かなり素朴なアプローチでも 86% 前後の正解度を達成しました。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### 経時的な正解度と損失のグラフを作成する\n", + "\n", + "`model.fit()` は、トレーニング中に発生したすべての情報を詰まったディクショナリを含む `History` オブジェクトを返します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "トレーニングと検証中に監視されている各メトリックに対して 1 つずつ、計 4 つのエントリがあります。このエントリを使用して、トレーニングと検証の損失とトレーニングと検証の正解度を比較したグラフを作成することができます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower 
right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "このグラフでは、点はトレーニングの損失と正解度を表し、実線は検証の損失と正解度を表します。\n", + "\n", + "トレーニングの損失がエポックごとに*下降*し、トレーニングの正解度がエポックごとに*上昇*していることに注目してください。これは、勾配下降最適化を使用しているときに見られる現象で、イテレーションごとに希望する量を最小化します。\n", + "\n", + "これは検証の損失と精度には当てはまりません。これらはトレーニング精度の前にピークに達しているようです。これが過適合の例で、モデルが、遭遇したことのないデータよりもトレーニングデータで優れたパフォーマンスを発揮する現象です。この後、モデルは過度に最適化し、テストデータに*一般化*しないトレーニングデータ*特有*の表現を学習します。\n", + "\n", + "この特定のケースでは、検証の正解度が向上しなくなったときにトレーニングを停止することにより、過適合を防ぐことができます。これを行うには、`tf.keras.callbacks.EarlyStopping` コールバックを使用することができます。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## モデルをエクスポートする\n", + "\n", + "上記のコードでは、モデルにテキストをフィードする前に、`TextVectorization` レイヤーをデータセットに適用しました。モデルで生の文字列を処理できるようにする場合 (たとえば、展開を簡素化するため)、モデル内に `TextVectorization` レイヤーを含めることができます。これを行うには、トレーニングしたばかりの重みを使用して新しいモデルを作成します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### 新しいデータの推論\n", + "\n", + "新しい例の予測を取得するには、`model.predict()`を呼び出します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "モデル内にテキスト前処理ロジックを含めると、モデルを本番環境にエクスポートして展開を簡素化し、[トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)の可能性を減らすことができます。\n", + "\n", + "TextVectorization レイヤーを適用する場所を選択する際に性能の違いに留意する必要があります。モデルの外部で使用すると、GPU でトレーニングするときに非同期 CPU 処理とデータのバッファリングを行うことができます。したがって、GPU でモデルをトレーニングしている場合は、モデルの開発中に最高のパフォーマンスを得るためにこのオプションを使用し、デプロイの準備ができたらモデル内に TextVectorization レイヤーを含めるように切り替えることをお勧めします。\n", + "\n", + "モデルの保存の詳細については、この[チュートリアル](https://www.tensorflow.org/tutorials/keras/save_and_load)にアクセスしてください。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## 演習:StackOverflow の質問に対するマルチクラス分類\n", + "\n", + "このチュートリアルでは、IMDB データセットで二項分類器を最初からトレーニングする方法を示しました。演習として、このノートブックを変更して、[Stack Overflow](http://stackoverflow.com/) のプログラミング質問のタグを予測するマルチクラス分類器をトレーニングできます。\n", + "\n", + "Stack Overflow に投稿された数千のプログラミングに関する質問(たとえば、「Python でディクショナリを値で並べ替える方法」)の本文を含む[データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)が用意されています。それぞれ、1 つのタグ(Python、CSharp、JavaScript、または Java のいずれか)でラベル付けされています。この演習では、質問を入力として受け取り、適切なタグ(この場合は Python)を予測します。\n", + "\n", + "使用するデータセットには、1,700 万件以上の投稿を含む [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) の大規模な StackOverflow パブリックデータセットから抽出された数千の質問が含まれています。\n", + "\n", + "データセットをダウンロードすると、以前に使用した IMDB データセットと同様のディレクトリ構造になっていることがわかります。\n", + "\n", + 
"```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "注意: 分類問題の難易度を上げるために、プログラミングの質問での Python、CSharp、JavaScript、または Java という単語は、*blank* という単語に置き換えられました(多くの質問には、対象の言語が含まれているため)。\n", + "\n", + "この演習を完了するには、、このノートブックを変更してStackOverflow データセットを操作する必要があります。次の変更を行います。\n", + "\n", + "1. ノートブックの上部で、IMDB データセットをダウンロードするコードを、事前に準備されている [Stack Overflow データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)をダウンロードするコードで更新します。Stack Overflow データセットは同様のディレクトリ構造を持っているため、多くの変更を加える必要はありません。\n", + "\n", + "2. 4 つの出力クラスがあるため、モデルの最後のレイヤーを `Dense(4)` に変更します。\n", + "\n", + "3. モデルをコンパイルするときは、損失を `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)` に変更します。これは、各クラスのラベルが整数である場合に、マルチクラス分類問題に使用する正しい損失関数です。(この場合、 0、*1*、*2*、または 3 のいずれかになります)。さらに、これはマルチクラス分類の問題であるため、メトリックを `metrics=['accuracy']` に変更します (tf.metrics.BinaryAccuracy はバイナリ分類器にのみ使用されます)。\n", + "\n", + "4. 経時的な精度をプロットする場合は、`binary_accuracy` および `val_binary_accuracy`をそれぞれ `accuracy` および `val_accuracy` に変更します。\n", + "\n", + "5. これらの変更が完了すると、マルチクラス分類器をトレーニングできるようになります。 " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## 詳細\n", + "\n", + "このチュートリアルでは、最初からテキスト分類を実行する方法を紹介しました。一般的なテキスト分類ワークフローの詳細については、Google Developers の[テキスト分類ガイド](https://developers.google.com/machine-learning/guides/text-classification/)をご覧ください。\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/ko/tutorials/keras/text_classification.ipynb b/site/ko/tutorials/keras/text_classification.ipynb index 74b14fda01..33bbca4842 100644 --- a/site/ko/tutorials/keras/text_classification.ipynb +++ b/site/ko/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# 영화 리뷰를 사용한 텍스트 분류" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
TensorFlow.org에서 보기 Google Colab에서 실행 GitHub에서 소스 보기 노트북 다운로드
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "이 튜토리얼은 디스크에 저장된 일반 텍스트 파일에서 시작하는 텍스트 분류를 보여줍니다. IMDB 데이터세트에 대한 감정 분석을 수행하도록 이진 분류기를 훈련합니다. 노트북의 마지막에는 스택 오버플로에서 프로그래밍 질문에 대한 태그를 예측하도록 다중 클래스 분류기를 훈련하는 연습을 시도해볼 수 있습니다.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## 감정 분석\n", - "\n", - "이 노트북은 리뷰 텍스트를 사용하여 영화 리뷰를 *긍정적* 또는 *부정적*으로 분류합니다. 중요하고 널리 적용 가능한 머신러닝 문제인 *이진* 분류의 예입니다.\n", - "\n", - "[IMDB 데이터세트](https://ai.stanford.edu/~amaas/data/sentiment/)에는 [인터넷 영화 데이터베이스](https://www.imdb.com/)에서 가져온 50,000개의 영화 리뷰 텍스트가 포함되어 있습니다. 훈련용 리뷰 25,000개와 테스트용 리뷰 25,000개로 나뉩니다. 훈련 및 테스트 세트는 *균형을 이룹니다*. 즉, 동일한 수의 긍정적인 리뷰와 부정적인 리뷰가 포함되어 있습니다.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### 데이터세트 다운로드 및 탐색하기\n", - "\n", - "데이터 세트를 다운로드하여 추출한 다음 디렉터리 구조를 살펴보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "`aclImdb/train/pos` 및 `aclImdb/train/neg` 디렉토리에는 각각 단일 영화를 리뷰한 많은 텍스트 파일이 포함되어 있습니다. 그 중 하나를 살펴보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### 데이터세트 로드하기\n", - "\n", - "다음으로, 디스크에서 데이터를 로드하고 훈련에 적합한 형식으로 준비합니다. 이를 위해 다음과 같은 디렉토리 구조를 예상하는 유용한 [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 유틸리티를 사용합니다.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "이진 분류를 위한 데이터세트를 준비하려면 디스크에 `class_a` 및 `class_b`에 해당하는 두 개의 폴더가 필요합니다. 이것들은 `aclImdb/train/pos` 및 `aclImdb/train/neg`에서 찾을 수 있는 긍정적 영화 리뷰와 부정적 영화 리뷰입니다. 
IMDB 데이터세트에는 추가 폴더가 포함되어 있으므로 이 유틸리티를 사용하기 전에 제거합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "다음으로 `text_dataset_from_directory` 유틸리티를 사용하여 레이블이 지정된 `tf.data.Dataset`를 만듭니다. [tf.data](https://www.tensorflow.org/guide/data)는 데이터 작업을 위한 강력한 도구 모음입니다.\n", - "\n", - "머신러닝 실험을 실행할 때 데이터세트를 [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set) 및 [test](https://developers.google.com/machine-learning/glossary#test-set)의 세 부분으로 나누는 것이 가장 좋습니다.\n", - "\n", - "IMDB 데이터세트는 이미 훈련과 테스트로 나누어져 있지만 검증 세트가 부족합니다. 아래 `validation_split` 인수를 사용하여 훈련 데이터를 80:20으로 분할하여 검증 세트를 생성해 보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "위에서 볼 수 있듯이 training 폴더에는 25,000개의 예제가 있으며 그 중 80%(또는 20,000개)를 훈련에 사용할 것입니다. 잠시 후에 알 수 있겠지만 데이터세트를 `model.fit`에 직접 전달하여 모델을 훈련할 수 있습니다. `tf.data`를 처음 사용하는 경우 데이터세트를 반복하고 다음과 같이 몇 가지 예를 출력할 수도 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "리뷰에는 `
`와 같은 간헐적 HTML 태그와 구두점을 포함한 원시 텍스트가 포함되어 있다는 점에 주목하세요. 다음 섹션에서 이를 처리하는 방법을 보여줍니다.\n", - "\n", - "레이블은 0 또는 1입니다. 이들 중 어느 것이 긍정적이고 부정적인 영화 리뷰에 해당하는지 확인하려면 데이터세트에서 `class_names` 속성을 확인할 수 있습니다.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "다음으로, 검증 및 테스트 데이터세트를 만듭니다. 검증을 위해 훈련 세트의 나머지 5,000개 리뷰를 사용합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "참고: `validation_split` 및 `subset` 인수를 사용할 때 검증 및 훈련 분할이 겹치지 않도록 임의 시드를 지정하거나 `shuffle=False`를 전달하는 것을 잊지 마세요." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### 훈련을 위한 데이터세트 준비하기\n", - "\n", - "다음으로, 유용한 `tf.keras.layers.TextVectorization` 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다.\n", - "\n", - "표준화는 일반적으로 구두점이나 HTML 요소를 제거하여 데이터세트를 단순화하기 위해 텍스트를 전처리하는 것을 말합니다. 토큰화는 문자열을 여러 토큰으로 분할하는 것을 말합니다(예: 화이트스페이스에서 분할하여 문장을 개별 단어로 분할). 벡터화는 토큰을 숫자로 변환하여 신경망에 공급될 수 있도록 하는 것을 말합니다. 이러한 모든 작업을 이 레이어에서 수행할 수 있습니다.\n", - "\n", - "위에서 볼 수 있듯이 리뷰에는 `
`와 같은 다양한 HTML 태그가 포함되어 있습니다. 이러한 태그는 `TextVectorization` 레이어의 기본 표준화 도구로 제거되지 않습니다(텍스트를 소문자로 변환하고 기본적으로 구두점을 제거하지만 HTML은 제거하지 않음). HTML을 제거하기 위해 사용자 정의 표준화 함수를 작성합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "참고: [훈련-테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(훈련-제공 왜곡이라고도 함)를 방지하려면 훈련 및 테스트 시간에 데이터를 동일하게 전처리하는 것이 중요합니다. 이를 용이하게 하기 위해 `TextVectorization` 레이어를 모델 내에 직접 포함할 수 있습니다. 본 튜토리얼에서 나중에 이 내용을 알아봅니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "다음으로 `TextVectorization` 레이어를 만듭니다. 이 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다. 각 토큰에 대해 고유한 정수 인덱스를 생성하도록 `output_mode`를 `int`로 설정합니다.\n", - "\n", - "기본 분할 함수와 위에서 정의한 사용자 지정 표준화 함수를 사용하고 있습니다. 명시적 최대값인 `sequence_length`와 같이 모델에 대한 몇 가지 상수를 정의하여 레이어가 시퀀스를 정확히 `sequence_length` 값으로 채우거나 자르도록 합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "다음으로, 전처리 레이어의 상태를 데이터세트에 맞추기 위해 `adapt`를 호출합니다. 그러면 모델이 문자열 인덱스를 정수로 빌드합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "참고: adapt를 호출할 때 훈련 데이터만 사용하는 것이 중요합니다(테스트세트를 사용하면 정보가 누출됨)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "이 레이어를 사용하여 일부 데이터를 전처리한 결과를 확인하는 함수를 만들어 보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "위에서 볼 수 있듯이 각 토큰은 정수로 대체되었습니다. 레이어에서 `.get_vocabulary()`를 호출하여 각 정수에 해당하는 토큰(문자열)을 조회할 수 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "모델을 훈련할 준비가 거의 되었습니다. 최종 전처리 단계로 이전에 생성한 TextVectorization 레이어를 훈련, 검증 및 테스트 데이터세트에 적용합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### 성능을 높이도록 데이터세트 구성하기\n", - "\n", - "다음은 I/O가 차단되지 않도록 데이터를 로드할 때 사용해야 하는 두 가지 중요한 메서드입니다.\n", - "\n", - "`.cache()`는 데이터가 디스크에서 로드된 후 메모리에 데이터를 보관합니다. 이렇게 하면 모델을 훈련하는 동안 데이터세트로 인해 병목 현상이 발생하지 않습니다. 데이터세트가 너무 커서 메모리에 맞지 않는 경우, 이 메서드를 사용하여 성능이 뛰어난 온 디스크 캐시를 생성할 수도 있습니다. 많은 작은 파일보다 읽기가 더 효율적입니다.\n", - "\n", - "`.prefetch()`는 훈련 중에 데이터 전처리 및 모델 실행과 겹칩니다.\n", - "\n", - "[데이터 성능 가이드](https://www.tensorflow.org/guide/data_performance)에서 두 가지 메서드와 데이터를 디스크에 캐싱하는 방법에 관해 자세히 알아볼 수 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### 모델 생성\n", - "\n", - "이제 신경망을 만들 차례입니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "층을 순서대로 쌓아 분류기(classifier)를 만듭니다:\n", - "\n", - "1. 첫 번째 레이어는 `Embedding` 레이어입니다. 이 레이어는 정수로 인코딩된 리뷰를 입력 받고 각 단어 인덱스에 해당하는 임베딩 벡터를 찾습니다. 이러한 벡터는 모델이 훈련되면서 학습됩니다. 이들 벡터는 출력 배열에 차원을 추가합니다. 최종 차원은 `(batch, sequence, embedding)`이 됩니다. 임베딩에 대해 보다 자세히 알아보려면 [단어 임베딩](https://www.tensorflow.org/text/guide/word_embeddings) 튜토리얼을 확인하세요.\n", - "2. 그다음 `GlobalAveragePooling1D` 층은 `sequence` 차원에 대해 평균을 계산하여 각 샘플에 대해 고정된 길이의 출력 벡터를 반환합니다. 이는 길이가 다른 입력을 다루는 가장 간단한 방법입니다.\n", - "3. 마지막 층은 하나의 출력 노드(node)를 가진 완전 연결 층입니다. `sigmoid` 활성화 함수를 사용하여 0과 1 사이의 실수를 출력합니다. 이 값은 확률 또는 신뢰도를 나타냅니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### 손실 함수와 옵티마이저\n", - "\n", - "모델이 훈련하려면 손실 함수(loss function)과 옵티마이저(optimizer)가 필요합니다. 이 예제는 이진 분류 문제이고 모델이 확률을 출력하므로(출력층의 유닛이 하나이고 `sigmoid` 활성화 함수를 사용합니다), `binary_crossentropy` 손실 함수를 사용하겠습니다.\n", - "\n", - "이제 최적화 기와 손실 함수를 사용하도록 모델을 구성합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### 모델 훈련하기\n", - "\n", - "`dataset` 개체를 fit 메서드에 전달하여 모델을 훈련합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### 모델 평가하기\n", - "\n", - "모델의 성능을 확인해 보죠. 두 개의 값이 반환됩니다. 손실(오차를 나타내는 숫자이므로 낮을수록 좋습니다)과 정확도입니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "이 상당히 단순한 접근 방식은 약 86%의 정확도를 달성합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### 정확도와 손실 그래프 그리기\n", - "\n", - "`model.fit()`은 훈련 중에 발생한 모든 것을 가진 사전을 포함하는 `History` 객체를 반환합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "네 개의 항목이 있습니다. 훈련과 검증 단계에서 모니터링하는 지표들입니다. 훈련 손실과 검증 손실을 그래프로 그려 보고, 훈련 정확도와 검증 정확도도 그래프로 그려서 비교해 보겠습니다:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "이 그래프에서 점선은 훈련 손실과 훈련 정확도를 나타냅니다. 실선은 검증 손실과 검증 정확도입니다.\n", - "\n", - "훈련 손실은 각 epoch마다 *감소*하고 훈련 정확성은 각 epoch마다 *증가*합니다. 경사 하강 최적화를 사용할 때 이와 같이 예상됩니다. 모든 반복에서 원하는 수량을 최소화해야 합니다.\n", - "\n", - "하지만 검증 손실과 검증 정확도에서는 그렇지 못합니다. 훈련 정확도 이전이 피크인 것 같습니다. 이는 과대적합 때문입니다. 이전에 본 적 없는 데이터보다 훈련 데이터에서 모델이 더 잘 동작합니다. 이 지점부터는 모델이 과도하게 최적화되어 테스트 데이터에서 *일반화*되지 않는 훈련 데이터의 *특정* 표현을 학습합니다.\n", - "\n", - "여기에서는 과대적합을 막기 위해 단순히 검증 정확도가 더 이상 증가하지 않는 경우에 훈련을 중단할 수 있습니다. 이를 수행하는 한 가지 방법은 `tf.keras.callbacks.EarlyStopping` 콜백을 사용하는 것입니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## 모델 내보내기\n", - "\n", - "위의 코드에서는 모델에 텍스트를 제공하기 전에 `TextVectorization` 레이어를 데이터세트에 적용했습니다. 모델이 원시 문자열을 처리할 수 있도록 하려면(예: 배포를 단순화하기 위해) 모델 내부에 `TextVectorization` 레이어를 포함할 수 있습니다. 이를 위해 방금 훈련한 가중치를 사용하여 새 모델을 만들 수 있습니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### 새로운 데이터로 추론하기\n", - "\n", - "새로운 예에 대한 예측을 얻으려면 간단히 `model.predict()`를 호출하면 됩니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "모델 내부에 텍스트 전처리 논리를 포함하면 배포를 단순화하고 [훈련/테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) 가능성을 줄이는 프로덕션용 모델을 내보낼 수 있습니다.\n", - "\n", - "TextVectorization 레이어를 적용할 위치를 선택할 때 염두에 두어야 할 성능 차이가 있습니다. 레이어를 모델 외부에서 사용하면 GPU에서 훈련할 때 비동기 CPU 처리 및 데이터 버퍼링을 수행할 수 있습니다. 따라서 GPU에서 모델을 훈련하는 경우 모델을 개발하는 동안 최상의 성능을 얻기 위해 이 옵션을 사용하고 배포 준비가 완료되면 모델 내부에 TextVectorization 레이어를 포함하도록 전환할 수 있습니다.\n", - "\n", - "모델 저장에 대해 자세히 알아보려면 이 [튜토리얼](https://www.tensorflow.org/tutorials/keras/save_and_load)을 방문하세요." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## 연습: 스택 오버플로 질문에 대한 다중 클래스 분류\n", - "\n", - "이 튜토리얼은 IMDB 데이터세트에서 이진 분류자를 처음부터 훈련하는 방법을 보여주었습니다. 연습으로, 이 노트북을 수정하여 [스택 오버플로](http://stackoverflow.com/)에서 프로그래밍 질문의 태그를 예측하도록 다중 클래스 분류자를 훈련할 수 있습니다.\n", - "\n", - "스택 오버플로에 게시된 수천 개의 프로그래밍 질문(예: \"Python에서 값을 기준으로 사전을 정렬할 수 있는 방법은?\")의 본문이 포함된 [데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)가 준비되어 있습니다. 이들 각각은 정확히 하나의 태그(Python, CSharp, JavaScript 또는 Java)로 레이블이 지정됩니다. 여러분이 할 작업은 질문을 입력으로 받아 적절한 태그(이 경우 Python)를 예측하는 것입니다.\n", - "\n", - "작업할 데이터세트에는 1,700만 개 이상의 게시물이 포함된 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow)의 훨씬 더 큰 공개 스택 오버플로 데이터세트에서 추출한 수천 개의 질문이 포함되어 있습니다.\n", - "\n", - "데이터세트를 다운로드해 보면 이전에 작업한 IMDB 데이터세트와 유사한 디렉터리 구조를 가지고 있음을 알 수 있습니다.\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "참고: 분류 문제의 난이도를 높이기 위해 프로그래밍 질문에서 Python, CSharp, JavaScript 또는 Java라는 단어의 출현은 *blank*라는 단어로 대체되었습니다(많은 질문에 해당 언어가 포함됨).\n", - "\n", - "이 연습을 완료하려면 다음과 같이 수정하여 스택 오버플로 데이터세트와 함께 작동하도록 이 노트북을 수정해야 합니다.\n", - "\n", - "1. 노트북 상단에서, 미리 준비된 [스택 오버플로 데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)를 다운로드하는 코드로 IMDB 데이터세트를 다운로드하는 코드를 업데이트합니다. 스택 오버플로 데이터세트는 유사한 디렉터리 구조를 가지므로 많이 수정할 필요가 없습니다.\n", - "\n", - "2. 이제 4개의 출력 클래스가 있으므로 `Dense(4)`를 읽도록 모델의 마지막 레이어를 수정합니다.\n", - "\n", - "3. 모델을 컴파일할 때 손실을 `tf.keras.losses.SparseCategoricalCrossentropy`로 변경합니다. 
이것은 각 클래스의 레이블이 정수일 때(이 경우 0, *1*, *2* 또는 *3*일 수 있음) 다중 클래스 분류 문제에 사용할 올바른 손실 함수입니다. 또한 이것은 다중 클래스 분류 문제이기 때문에 메트릭을 `metrics=['accuracy']`로 변경합니다(`tf.metrics.BinaryAccuracy`는 이진 분류자에만 사용됨).\n", - "\n", - "4. 시간 경과에 따른 정확도를 표시할 때 `binary_accuracy` 및 `val_binary_accuracy`를 각각 `accuracy` 및 `val_accuracy`로 변경합니다.\n", - "\n", - "5. 이러한 변경이 완료되면 다중 클래스 분류자를 훈련할 수 있습니다. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## 더 알아보기\n", - "\n", - "이 튜토리얼은 텍스트 분류를 처음부터 알아보았습니다. 일반적인 텍스트 분류 워크플로에 대해 자세히 알아보려면 Google Developers의 [텍스트 분류 가이드](https://developers.google.com/machine-learning/guides/text-classification/)를 확인하세요.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# 영화 리뷰를 사용한 텍스트 분류" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
TensorFlow.org에서 보기 Google Colab에서 실행 GitHub에서 소스 보기 노트북 다운로드
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "이 튜토리얼은 디스크에 저장된 일반 텍스트 파일에서 시작하는 텍스트 분류를 보여줍니다. IMDB 데이터세트에 대한 감정 분석을 수행하도록 이진 분류기를 훈련합니다. 노트북의 마지막에는 스택 오버플로에서 프로그래밍 질문에 대한 태그를 예측하도록 다중 클래스 분류기를 훈련하는 연습을 시도해볼 수 있습니다.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## 감정 분석\n", + "\n", + "이 노트북은 리뷰 텍스트를 사용하여 영화 리뷰를 *긍정적* 또는 *부정적*으로 분류합니다. 중요하고 널리 적용 가능한 머신러닝 문제인 *이진* 분류의 예입니다.\n", + "\n", + "[IMDB 데이터세트](https://ai.stanford.edu/~amaas/data/sentiment/)에는 [인터넷 영화 데이터베이스](https://www.imdb.com/)에서 가져온 50,000개의 영화 리뷰 텍스트가 포함되어 있습니다. 훈련용 리뷰 25,000개와 테스트용 리뷰 25,000개로 나뉩니다. 훈련 및 테스트 세트는 *균형을 이룹니다*. 즉, 동일한 수의 긍정적인 리뷰와 부정적인 리뷰가 포함되어 있습니다.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### 데이터세트 다운로드 및 탐색하기\n", + "\n", + "데이터 세트를 다운로드하여 추출한 다음 디렉터리 구조를 살펴보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "`aclImdb/train/pos` 및 `aclImdb/train/neg` 디렉토리에는 각각 단일 영화를 리뷰한 많은 텍스트 파일이 포함되어 있습니다. 그 중 하나를 살펴보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### 데이터세트 로드하기\n", + "\n", + "다음으로, 디스크에서 데이터를 로드하고 훈련에 적합한 형식으로 준비합니다. 이를 위해 다음과 같은 디렉토리 구조를 예상하는 유용한 [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 유틸리티를 사용합니다.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "이진 분류를 위한 데이터세트를 준비하려면 디스크에 `class_a` 및 `class_b`에 해당하는 두 개의 폴더가 필요합니다. 이것들은 `aclImdb/train/pos` 및 `aclImdb/train/neg`에서 찾을 수 있는 긍정적 영화 리뷰와 부정적 영화 리뷰입니다. 
IMDB 데이터세트에는 추가 폴더가 포함되어 있으므로 이 유틸리티를 사용하기 전에 제거합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "다음으로 `text_dataset_from_directory` 유틸리티를 사용하여 레이블이 지정된 `tf.data.Dataset`를 만듭니다. [tf.data](https://www.tensorflow.org/guide/data)는 데이터 작업을 위한 강력한 도구 모음입니다.\n", + "\n", + "머신러닝 실험을 실행할 때 데이터세트를 [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set) 및 [test](https://developers.google.com/machine-learning/glossary#test-set)의 세 부분으로 나누는 것이 가장 좋습니다.\n", + "\n", + "IMDB 데이터세트는 이미 훈련과 테스트로 나누어져 있지만 검증 세트가 부족합니다. 아래 `validation_split` 인수를 사용하여 훈련 데이터를 80:20으로 분할하여 검증 세트를 생성해 보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "위에서 볼 수 있듯이 training 폴더에는 25,000개의 예제가 있으며 그 중 80%(또는 20,000개)를 훈련에 사용할 것입니다. 잠시 후에 알 수 있겠지만 데이터세트를 `model.fit`에 직접 전달하여 모델을 훈련할 수 있습니다. `tf.data`를 처음 사용하는 경우 데이터세트를 반복하고 다음과 같이 몇 가지 예를 출력할 수도 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "리뷰에는 `
`와 같은 간헐적 HTML 태그와 구두점을 포함한 원시 텍스트가 포함되어 있다는 점에 주목하세요. 다음 섹션에서 이를 처리하는 방법을 보여줍니다.\n", + "\n", + "레이블은 0 또는 1입니다. 이들 중 어느 것이 긍정적이고 부정적인 영화 리뷰에 해당하는지 확인하려면 데이터세트에서 `class_names` 속성을 확인할 수 있습니다.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "다음으로, 검증 및 테스트 데이터세트를 만듭니다. 검증을 위해 훈련 세트의 나머지 5,000개 리뷰를 사용합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "참고: `validation_split` 및 `subset` 인수를 사용할 때 검증 및 훈련 분할이 겹치지 않도록 임의 시드를 지정하거나 `shuffle=False`를 전달하는 것을 잊지 마세요." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### 훈련을 위한 데이터세트 준비하기\n", + "\n", + "다음으로, 유용한 `tf.keras.layers.TextVectorization` 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다.\n", + "\n", + "표준화는 일반적으로 구두점이나 HTML 요소를 제거하여 데이터세트를 단순화하기 위해 텍스트를 전처리하는 것을 말합니다. 토큰화는 문자열을 여러 토큰으로 분할하는 것을 말합니다(예: 화이트스페이스에서 분할하여 문장을 개별 단어로 분할). 벡터화는 토큰을 숫자로 변환하여 신경망에 공급될 수 있도록 하는 것을 말합니다. 이러한 모든 작업을 이 레이어에서 수행할 수 있습니다.\n", + "\n", + "위에서 볼 수 있듯이 리뷰에는 `
`와 같은 다양한 HTML 태그가 포함되어 있습니다. 이러한 태그는 `TextVectorization` 레이어의 기본 표준화 도구로 제거되지 않습니다(텍스트를 소문자로 변환하고 기본적으로 구두점을 제거하지만 HTML은 제거하지 않음). HTML을 제거하기 위해 사용자 정의 표준화 함수를 작성합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "참고: [훈련-테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(훈련-제공 왜곡이라고도 함)를 방지하려면 훈련 및 테스트 시간에 데이터를 동일하게 전처리하는 것이 중요합니다. 이를 용이하게 하기 위해 `TextVectorization` 레이어를 모델 내에 직접 포함할 수 있습니다. 본 튜토리얼에서 나중에 이 내용을 알아봅니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "다음으로 `TextVectorization` 레이어를 만듭니다. 이 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다. 각 토큰에 대해 고유한 정수 인덱스를 생성하도록 `output_mode`를 `int`로 설정합니다.\n", + "\n", + "기본 분할 함수와 위에서 정의한 사용자 지정 표준화 함수를 사용하고 있습니다. 명시적 최대값인 `sequence_length`와 같이 모델에 대한 몇 가지 상수를 정의하여 레이어가 시퀀스를 정확히 `sequence_length` 값으로 채우거나 자르도록 합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "다음으로, 전처리 레이어의 상태를 데이터세트에 맞추기 위해 `adapt`를 호출합니다. 그러면 모델이 문자열 인덱스를 정수로 빌드합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "참고: adapt를 호출할 때 훈련 데이터만 사용하는 것이 중요합니다(테스트세트를 사용하면 정보가 누출됨)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "이 레이어를 사용하여 일부 데이터를 전처리한 결과를 확인하는 함수를 만들어 보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "위에서 볼 수 있듯이 각 토큰은 정수로 대체되었습니다. 레이어에서 `.get_vocabulary()`를 호출하여 각 정수에 해당하는 토큰(문자열)을 조회할 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "모델을 훈련할 준비가 거의 되었습니다. 최종 전처리 단계로 이전에 생성한 TextVectorization 레이어를 훈련, 검증 및 테스트 데이터세트에 적용합니다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### 성능을 높이도록 데이터세트 구성하기\n", + "\n", + "다음은 I/O가 차단되지 않도록 데이터를 로드할 때 사용해야 하는 두 가지 중요한 메서드입니다.\n", + "\n", + "`.cache()`는 데이터가 디스크에서 로드된 후 메모리에 데이터를 보관합니다. 이렇게 하면 모델을 훈련하는 동안 데이터세트로 인해 병목 현상이 발생하지 않습니다. 데이터세트가 너무 커서 메모리에 맞지 않는 경우, 이 메서드를 사용하여 성능이 뛰어난 온 디스크 캐시를 생성할 수도 있습니다. 많은 작은 파일보다 읽기가 더 효율적입니다.\n", + "\n", + "`.prefetch()`는 훈련 중에 데이터 전처리 및 모델 실행과 겹칩니다.\n", + "\n", + "[데이터 성능 가이드](https://www.tensorflow.org/guide/data_performance)에서 두 가지 메서드와 데이터를 디스크에 캐싱하는 방법에 관해 자세히 알아볼 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### 모델 생성\n", + "\n", + "이제 신경망을 만들 차례입니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "층을 순서대로 쌓아 분류기(classifier)를 만듭니다:\n", + "\n", + "1. 첫 번째 레이어는 `Embedding` 레이어입니다. 이 레이어는 정수로 인코딩된 리뷰를 입력 받고 각 단어 인덱스에 해당하는 임베딩 벡터를 찾습니다. 이러한 벡터는 모델이 훈련되면서 학습됩니다. 이들 벡터는 출력 배열에 차원을 추가합니다. 최종 차원은 `(batch, sequence, embedding)`이 됩니다. 임베딩에 대해 보다 자세히 알아보려면 [단어 임베딩](https://www.tensorflow.org/text/guide/word_embeddings) 튜토리얼을 확인하세요.\n", + "2. 그다음 `GlobalAveragePooling1D` 층은 `sequence` 차원에 대해 평균을 계산하여 각 샘플에 대해 고정된 길이의 출력 벡터를 반환합니다. 이는 길이가 다른 입력을 다루는 가장 간단한 방법입니다.\n", + "3. 마지막 층은 하나의 출력 노드(node)를 가진 완전 연결 층입니다. `sigmoid` 활성화 함수를 사용하여 0과 1 사이의 실수를 출력합니다. 이 값은 확률 또는 신뢰도를 나타냅니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### 손실 함수와 옵티마이저\n", + "\n", + "모델이 훈련하려면 손실 함수(loss function)과 옵티마이저(optimizer)가 필요합니다. 이 예제는 이진 분류 문제이고 모델이 확률을 출력하므로(출력층의 유닛이 하나이고 `sigmoid` 활성화 함수를 사용합니다), `binary_crossentropy` 손실 함수를 사용하겠습니다.\n", + "\n", + "이제 최적화 기와 손실 함수를 사용하도록 모델을 구성합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### 모델 훈련하기\n", + "\n", + "`dataset` 개체를 fit 메서드에 전달하여 모델을 훈련합니다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### 모델 평가하기\n", + "\n", + "모델의 성능을 확인해 보죠. 두 개의 값이 반환됩니다. 손실(오차를 나타내는 숫자이므로 낮을수록 좋습니다)과 정확도입니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "이 상당히 단순한 접근 방식은 약 86%의 정확도를 달성합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### 정확도와 손실 그래프 그리기\n", + "\n", + "`model.fit()`은 훈련 중에 발생한 모든 것을 가진 사전을 포함하는 `History` 객체를 반환합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "네 개의 항목이 있습니다. 훈련과 검증 단계에서 모니터링하는 지표들입니다. 훈련 손실과 검증 손실을 그래프로 그려 보고, 훈련 정확도와 검증 정확도도 그래프로 그려서 비교해 보겠습니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "이 그래프에서 점선은 훈련 손실과 훈련 정확도를 나타냅니다. 실선은 검증 손실과 검증 정확도입니다.\n", + "\n", + "훈련 손실은 각 epoch마다 *감소*하고 훈련 정확성은 각 epoch마다 *증가*합니다. 경사 하강 최적화를 사용할 때 이와 같이 예상됩니다. 모든 반복에서 원하는 수량을 최소화해야 합니다.\n", + "\n", + "하지만 검증 손실과 검증 정확도에서는 그렇지 못합니다. 훈련 정확도 이전이 피크인 것 같습니다. 이는 과대적합 때문입니다. 이전에 본 적 없는 데이터보다 훈련 데이터에서 모델이 더 잘 동작합니다. 이 지점부터는 모델이 과도하게 최적화되어 테스트 데이터에서 *일반화*되지 않는 훈련 데이터의 *특정* 표현을 학습합니다.\n", + "\n", + "여기에서는 과대적합을 막기 위해 단순히 검증 정확도가 더 이상 증가하지 않는 경우에 훈련을 중단할 수 있습니다. 이를 수행하는 한 가지 방법은 `tf.keras.callbacks.EarlyStopping` 콜백을 사용하는 것입니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## 모델 내보내기\n", + "\n", + "위의 코드에서는 모델에 텍스트를 제공하기 전에 `TextVectorization` 레이어를 데이터세트에 적용했습니다. 모델이 원시 문자열을 처리할 수 있도록 하려면(예: 배포를 단순화하기 위해) 모델 내부에 `TextVectorization` 레이어를 포함할 수 있습니다. 이를 위해 방금 훈련한 가중치를 사용하여 새 모델을 만들 수 있습니다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### 새로운 데이터로 추론하기\n", + "\n", + "새로운 예에 대한 예측을 얻으려면 간단히 `model.predict()`를 호출하면 됩니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "모델 내부에 텍스트 전처리 논리를 포함하면 배포를 단순화하고 [훈련/테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) 가능성을 줄이는 프로덕션용 모델을 내보낼 수 있습니다.\n", + "\n", + "TextVectorization 레이어를 적용할 위치를 선택할 때 염두에 두어야 할 성능 차이가 있습니다. 레이어를 모델 외부에서 사용하면 GPU에서 훈련할 때 비동기 CPU 처리 및 데이터 버퍼링을 수행할 수 있습니다. 따라서 GPU에서 모델을 훈련하는 경우 모델을 개발하는 동안 최상의 성능을 얻기 위해 이 옵션을 사용하고 배포 준비가 완료되면 모델 내부에 TextVectorization 레이어를 포함하도록 전환할 수 있습니다.\n", + "\n", + "모델 저장에 대해 자세히 알아보려면 이 [튜토리얼](https://www.tensorflow.org/tutorials/keras/save_and_load)을 방문하세요." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## 연습: 스택 오버플로 질문에 대한 다중 클래스 분류\n", + "\n", + "이 튜토리얼은 IMDB 데이터세트에서 이진 분류자를 처음부터 훈련하는 방법을 보여주었습니다. 연습으로, 이 노트북을 수정하여 [스택 오버플로](http://stackoverflow.com/)에서 프로그래밍 질문의 태그를 예측하도록 다중 클래스 분류자를 훈련할 수 있습니다.\n", + "\n", + "스택 오버플로에 게시된 수천 개의 프로그래밍 질문(예: \"Python에서 값을 기준으로 사전을 정렬할 수 있는 방법은?\")의 본문이 포함된 [데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)가 준비되어 있습니다. 이들 각각은 정확히 하나의 태그(Python, CSharp, JavaScript 또는 Java)로 레이블이 지정됩니다. 여러분이 할 작업은 질문을 입력으로 받아 적절한 태그(이 경우 Python)를 예측하는 것입니다.\n", + "\n", + "작업할 데이터세트에는 1,700만 개 이상의 게시물이 포함된 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow)의 훨씬 더 큰 공개 스택 오버플로 데이터세트에서 추출한 수천 개의 질문이 포함되어 있습니다.\n", + "\n", + "데이터세트를 다운로드해 보면 이전에 작업한 IMDB 데이터세트와 유사한 디렉터리 구조를 가지고 있음을 알 수 있습니다.\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "참고: 분류 문제의 난이도를 높이기 위해 프로그래밍 질문에서 Python, CSharp, JavaScript 또는 Java라는 단어의 출현은 *blank*라는 단어로 대체되었습니다(많은 질문에 해당 언어가 포함됨).\n", + "\n", + "이 연습을 완료하려면 다음과 같이 수정하여 스택 오버플로 데이터세트와 함께 작동하도록 이 노트북을 수정해야 합니다.\n", + "\n", + "1. 노트북 상단에서, 미리 준비된 [스택 오버플로 데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)를 다운로드하는 코드로 IMDB 데이터세트를 다운로드하는 코드를 업데이트합니다. 스택 오버플로 데이터세트는 유사한 디렉터리 구조를 가지므로 많이 수정할 필요가 없습니다.\n", + "\n", + "2. 이제 4개의 출력 클래스가 있으므로 `Dense(4)`를 읽도록 모델의 마지막 레이어를 수정합니다.\n", + "\n", + "3. 모델을 컴파일할 때 손실을 `tf.keras.losses.SparseCategoricalCrossentropy`로 변경합니다. 
이것은 각 클래스의 레이블이 정수일 때(이 경우 0, *1*, *2* 또는 *3*일 수 있음) 다중 클래스 분류 문제에 사용할 올바른 손실 함수입니다. 또한 이것은 다중 클래스 분류 문제이기 때문에 메트릭을 `metrics=['accuracy']`로 변경합니다(`tf.metrics.BinaryAccuracy`는 이진 분류자에만 사용됨).\n", + "\n", + "4. 시간 경과에 따른 정확도를 표시할 때 `binary_accuracy` 및 `val_binary_accuracy`를 각각 `accuracy` 및 `val_accuracy`로 변경합니다.\n", + "\n", + "5. 이러한 변경이 완료되면 다중 클래스 분류자를 훈련할 수 있습니다. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## 더 알아보기\n", + "\n", + "이 튜토리얼은 텍스트 분류를 처음부터 알아보았습니다. 일반적인 텍스트 분류 워크플로에 대해 자세히 알아보려면 Google Developers의 [텍스트 분류 가이드](https://developers.google.com/machine-learning/guides/text-classification/)를 확인하세요.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/pt-br/tutorials/keras/text_classification.ipynb b/site/pt-br/tutorials/keras/text_classification.ipynb index 259fe0d015..35336e943b 100644 --- a/site/pt-br/tutorials/keras/text_classification.ipynb +++ b/site/pt-br/tutorials/keras/text_classification.ipynb @@ -1,978 +1,978 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# Classificação de texto" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
Ver em TensorFlow.org\n", - " Executar no Google Colab\n", - " Ver fonte no GitHub\n", - " Baixar notebook\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "Este tutorial demonstra a classificação de texto, começando pela classificação de arquivos de texto sem formatação armazenados no disco. Você treinará um classificador binário para fazer análise de sentimento para um dataset do IMDB. No final do notebook, você poderá fazer um exercício, em que treinará um classificador multiclasse para prever a tag de uma pergunta de programação no Stack Overflow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## Análise de sentimento\n", - "\n", - "Este notebook treina um modelo de análise de sentimento para classificar avaliações de filmes como *positivas* ou *negativas*, com base no texto da avaliação. Este é um exemplo de classificação *binária*, ou de duas classes, um tipo de problema de aprendizado de máquina importante, com diversas aplicações.\n", - "\n", - "Você usará o [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/), que contém o texto de 50 mil avaliações de filmes do [Internet Movie Database](https://www.imdb.com/). Elas são divididas em 25 mil avaliações para treinamento e 25 mil para teste. Os conjuntos de treinamento e teste são *equilibrados*, ou seja, contêm a mesma quantidade de avaliações positivas e negativas.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### Baixe e explore o dataset do IMDB\n", - "\n", - "Vamos baixar e extrair o dataset, depois vamos explorar a estrutura de diretórios." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "Os diretórios `aclImdb/train/pos` e `aclImdb/train/neg` contêm diversos arquivos de texto, sendo que cada um é uma única avaliação de filme. Vamos dar uma olhada em um desses arquivos." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### Carregue o dataset\n", - "\n", - "Agora, você vai carregar os dados para fora do disco e colocá-los em um formato adequado para o treinamento. Para isso, você usará um utilitário muito útil, o [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera uma estrutura de diretórios, como mostrado abaixo.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "Para preparar um dataset para fazer classificação binária, você precisa de duas pastas no disco, correspondentes a `class_a` e `class_b`. Elas conterão avaliações positivas e negativas de filmes, que podem ser encontradas em `aclImdb/train/pos` e `aclImdb/train/neg`. Como o dataset do IMDB contém pastas adicionais, você vai removê-las antes de usar o utilitário." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "Agora, você usará o utilitário `text_dataset_from_directory` para criar um `tf.data.Dataset` com rótulos. [tf.data](https://www.tensorflow.org/guide/data) é uma coleção de ferramentas avançadas para trabalhar com dados.\n", - "\n", - "Ao realizar um experimento de aprendizado de máquina, é uma prática recomendada dividir o dataset em três: [treinamento](https://developers.google.com/machine-learning/glossary#training_set), [validação](https://developers.google.com/machine-learning/glossary#validation_set) e [teste](https://developers.google.com/machine-learning/glossary#test-set).\n", - "\n", - "O dataset do IMDB já foi dividido em conjuntos de treinamento e teste, mas ainda falta um de validação. Vamos criar um conjunto de validação utilizando uma divisão 80/20 para os dados do treinamento por meio do argumento `validation_split` abaixo." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "Como podemos ver acima, há 25 mil exemplos na pasta de treinamento, das quais serão usadas 80%, ou 20 mil, para treinamento. Como veremos em breve, você pode treinar um modelo passando um dataset diretamente para `model.fit`. Se você ainda estiver aprendendo sobre `tf.data`, também pode fazer a iteração do dataset e exibir alguns exemplos, conforme mostrado abaixo." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "Observe que a avaliação contém texto bruto (com pontuações e tags HTML, como `
`). Você verá como lidar com isso na próxima seção.\n", - "\n", - "Os rótulos são 0 e 1. Para ver qual deles corresponde a avaliações positivas ou negativas de filmes, confira a propriedade `class_names` do dataset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "Em seguida, você criará um dataset de validação e de teste. Você usará as 5 mil avaliações restantes do conjunto de treinamento para a validação." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "Observação: ao usar os argumentos `validation_split` e `subset`, especifique uma semente aleatória ou passe `shuffle=False` para que as divisões de validação e treinamento não se sobreponham." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### Prepare o dataset para treinamento\n", - "\n", - "Em seguida, você vai padronizar, tokenizar e vetorizar os dados usando a camada `tf.keras.layers.TextVectorization`.\n", - "\n", - "Padronização refere-se ao pré-processamento do texto, tipicamente para remover pontuações ou elementos HTML a fim de simplificar o dataset. Tokenização refere-se à divisão das strings em tokens (por exemplo, dividir uma frase em palavras individuais, fazendo a divisão a cada espaço). Vetorização refere-se à conversão de tokens em números para que eles possam ser alimentados em uma rede neural. Todas essas tarefas podem ser feitas com essa camada.\n", - "\n", - "Como visto acima, as avaliações contêm diversas tags HTML, como `
`. Elas não serão removidas pelo padronizador padrão na camada `TextVectorization` (que converte texto em letras minúsculas e remove as pontuações por padrão, mas não retira código HTML). Você escreverá uma função de padronização personalizada para remover código HTML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "Observação: para evitar o [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (também conhecido como desvio de treinamento/serviço), é importante pré-processar os dados de forma idêntica no momento de treinamento e teste. Para isso, a camada `TextVectorization` pode ser incluída diretamente dentro do modelo, conforme exibido posteriormente neste tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "Em seguida, você criará uma camada `TextVectorization`, que será usada para padronizar, tokenizar e vetorizar os dados. Você deve definir `output_mode` como `int` para criar índices de inteiros únicos para cada token.\n", - "\n", - "Observe que você está utilizando a função de divisão padrão e a função de padronização personalizada definida acima. Você também definirá algumas constantes para o modelo, como um mínimo explícito `sequence_length`, que fará a camada preencher ou truncar sequências para valores exatamente iguais a `sequence_length`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "Em seguida, chame `adapt` para adequar o estado da camada de pré-processamento ao dataset. Isso fará com que o modelo crie um índice de strings para os números inteiros." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "Observação: é importante usar somente os dados de treinamento ao chamar adapt, já que o uso do dataset de teste vazaria informações." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "Vamos criar uma função para ver o resultado ao usar esta camada para pré-processar alguns dados." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "Conforme visto acima, cada token foi substituído por um inteiro. Para visualizar o token (string) ao qual cada inteiro corresponde, você pode chamar `.get_vocabulary()` na camada." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "Está quase tudo pronto para treinar o modelo. Como etapa final de pré-processamento, você aplicará a camada TextVectorization criada anteriormente aos datasets de treinamento, validação e teste." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### Configure o dataset para melhor desempenho\n", - "\n", - "Há dois métodos importantes que você deve usar ao carregar os dados para garantir que a I/O não seja bloqueada.\n", - "\n", - "`.cache` mantém os dados na memória após o carregamento fora do disco. Isso garante que o dataset não se torne um gargalo ao treinar seu modelo. Se o dataset for muito grande para a memória, você também pode usar esse método para criar um cache no disco eficaz, que tem uma leitura mais eficiente do que vários arquivos pequenos.\n", - "\n", - "`/prefetch` sobrepõe o pré-processamento de dados e a execução do modelo durante o treinamento.\n", - "\n", - "Saiba mais sobre ambos os métodos, além de como armazenar os dados em cache no disco, no [guia sobre desempenho dos dados](https://www.tensorflow.org/guide/data_performance)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### Crie o modelo\n", - "\n", - "Chegou a hora de criar sua rede neural:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "As camadas são empilhadas sequencialmente para construir o classificador:\n", - "\n", - "1. A primeira é uma camada `Embedding`, que recebe avaliações codificadas em inteiros e avalia um vetor de embedding para cada palavra-índice. Esses vetores são aprendidos à medida que o modelo é treinado. Os vetores acrescentam uma dimensão à matriz de saída. As dimensões resultantes são: `(batch, sequence, embedding)` (lote, sequência, embedding). 
Para saber mais sobre embeddings, confira o tutorial [Embeddings de palavras](https://www.tensorflow.org/text/guide/word_embeddings).\n", - "2. A segunda camada é `GlobalAveragePooling1D`, que retorna um vetor de saída de tamanho fixo para cada exemplo, calculando a média da dimensão de sequência. Dessa forma, o modelo consegue lidar com entradas de tamanho variável da forma mais simples possível.\n", - "3. A última camada é densamente conectada com um único nó de saída." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### Função de perda e otimizador\n", - "\n", - "Todo modelo precisa de uma função de perda e um otimizador para o treinamento. Como este é um problema de classificação binária e o modelo gera como saída uma probabilidade (uma camada de unidade única com uma ativação sigmóide), você usará a função de perda `losses.BinaryCrossentropy`.\n", - "\n", - "Agora, configure o modelo para usar um otimizador e uma função de perda:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### Treine o modelo\n", - "\n", - "Você passará o objeto `dataset` ao método fit para treinar o modelo." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### Avalie o modelo\n", - "\n", - "Vamos conferir o desempenho do modelo. Serão retornados dois valores: perda (um número que representa o erro; quanto menor, melhor) e exatidão." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "Essa estratégia bem simples atinge uma exatidão de cerca de 86%." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### Crie um gráfico de exatidão e perda ao longo do tempo\n", - "\n", - "`model.fit()` retorna um objeto `History` que contém um dicionário com tudo o que aconteceu durante o treinamento:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "Há quatro entradas: uma para cada métrica monitorada durante o treinamento e a validação. 
Você usará esses valores para plotar a perda do treinamento e da validação para fins comparativos, além da exatidão do treinamento e da validação:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "Neste gráfico, os pontos representam a perda e exatidão do treinamento, enquanto as linhas sólidas representam a perda e exatidão da validação.\n", - "\n", - "Observe que a perda do treinamento *diminui* a cada época, e a exatidão do treinamento *aumenta* a cada época. Isso é o esperado ao usar uma otimização do método do gradiente descendente, que deve minimizar a quantidade desejada em cada iteração.\n", - "\n", - "Esse não é o caso para a perda e exatidão de validação, que parecem atingir o pico antes da exatidão do treinamento. Este é um exemplo de overfitting: o modelo tem desempenho melhor com os dados de treinamento em comparação a dados nunca vistos antes. Após esse ponto, o modelo sofre uma sobreotimização e aprende representações *específicas* dos dados de treinamento que não oferecem boas *generalizações* para os dados de teste.\n", - "\n", - "Para este caso específico, é possível evitar o overfitting simplesmente parando o treinamento quando a exatidão da validação deixa de aumentar. Uma forma de fazer isso é usando o callback `tf.keras.callbacks.EarlyStopping`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## Exporte o modelo\n", - "\n", - "No código acima, você aplicou a camada `TextVectorization` ao dataset antes de alimentar o modelo com texto. Se quiser tornar o modelo capaz de processar strings brutas (por exemplo, para simplificar a implantação), é possível incluir a camada `TextVectorization` dentro do modelo. Para isso, você pode criar um novo modelo usando os pesos que acabou de treinar." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### Inferência de dados novos\n", - "\n", - "Para fazer previsões para novos exemplos, basta chamar `model.predict()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "Ao incluir a lógica de pré-processamento de texto dentro do modelo, você pode exportar um modelo para produção que simplifica a implantação e reduz o potencial de [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", - "\n", - "Há uma diferença de desempenho que você deve considerar ao escolher onde aplicar a camada TextVectorization. Ao usá-la fora do modelo, você pode fazer o processamento assíncrono na CPU e armazenar os dados em buffer ao treinar na GPU. Portanto, se você estiver treinando seu modelo na GPU, deve escolher essa opção para obter o melhor desempenho ao desenvolver o modelo. Depois, quando você estiver pronto para preparar a implantação, inclua a camada TextVectorization dentro do modelo.\n", - "\n", - "Confira este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para saber mais sobre como salvar modelos." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## Exercício: classificação multiclasse para perguntas do Stack Overflow\n", - "\n", - "Este tutorial mostrou como treinar um classificador binário do zero usando o dataset do IMDB. Você pode fazer um exercício: modifique este notebook para treinar um classificador multiclasse que preveja a tag de uma pergunta de programação feita no [Stack Overflow](http://stackoverflow.com/).\n", - "\n", - "Um [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) foi preparado para uso, contendo o texto de milhares de perguntas de programação (por exemplo, \"Como posso ordenar um dicionário por valor no Python?\") publicadas no Stack Overflow. Cada pergunta é rotulada com exatamente uma tag (Python, CSharp, JavaScript ou Java). 
Sua tarefa é receber uma pergunta como entrada e prever a tag apropriada, que, neste caso, é Python.\n", - "\n", - "Você usará um dataset que contém milhares de perguntas extraídas do dataset público do Stack Overflow, que é bem maior, no [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), contendo mais de 17 milhões de publicações.\n", - "\n", - "Após baixar o dataset, você verá que ele tem uma estrutura de diretórios similar ao dataset do IMDB utilizado anteriormente:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "Observação: para aumentar a dificuldade do problema de classificação, as ocorrências das palavras Python, CSharp, JavaScript e Java nas perguntas de programação foram substituídas pela palavra *blank* (em branco), já que diversas perguntas contêm a linguagem de programação em questão.\n", - "\n", - "Para fazer este exercício, você deve modificar este notebook para que funcione com o dataset do Stack Overflow das seguintes maneiras:\n", - "\n", - "1. Na parte superior do notebook, atualize o código que baixa o dataset do IMDB com o código que baixa o [dataset do Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz), que já foi preparado. Como o dataset do Stack Overflow tem uma estrutura de diretórios parecida, você não precisará fazer muitas modificações.\n", - "\n", - "2. Modifique a última camada do modelo para `Dense(4)`, pois agora há quatro classes de saída.\n", - "\n", - "3. Ao compilar o modelo, altere a perda para `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta é a função de perda correta a ser usada para problemas de classificação muilticlasse, quando os rótulos de cada classe são inteiros (neste caso, podem ser 0, *1*, *2* ou *3*). Além disso, altere as métricas para `metrics=['accuracy']`, já que este é um problema de classificação multicasse (`tf.metrics.BinaryAccuracy` é usado somente para classificadores binários).\n", - "\n", - "4. Ao plotar a precisão ao longo do tempo, altere `binary_accuracy` e `val_binary_accuracy` para `accuracy` e `val_accuracy`, respectivamente.\n", - "\n", - "5. Após fazer essas alterações, você poderá treinar um classificador multiclasse. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## Saiba mais\n", - "\n", - "Este tutorial mostrou como fazer a classificação de texto do zero. Para saber mais sobre o workflow de classificação de texto de forma geral, confira o [guia Classificação de texto](https://developers.google.com/machine-learning/guides/text-classification/) no Google Developers.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." 
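Referring back to the Stack Overflow exercise described above, the following is a minimal sketch of the model-side changes it asks for: a `Dense(4)` output layer, `SparseCategoricalCrossentropy(from_logits=True)` as the loss, and `metrics=['accuracy']`. It assumes `max_features` and `embedding_dim` keep the values used in the IMDB walkthrough; it is an illustration of the exercise, not code from the notebook.

```python
import tensorflow as tf

# Sketch of the multi-class variant for the Stack Overflow exercise.
multiclass_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features + 1, embedding_dim),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(4)])  # one logit per tag (csharp, java, javascript, python)

multiclass_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
```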
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# Classificação de texto" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
Ver em TensorFlow.org\n", + " Executar no Google Colab\n", + " Ver fonte no GitHub\n", + " Baixar notebook\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "Este tutorial demonstra a classificação de texto, começando pela classificação de arquivos de texto sem formatação armazenados no disco. Você treinará um classificador binário para fazer análise de sentimento para um dataset do IMDB. No final do notebook, você poderá fazer um exercício, em que treinará um classificador multiclasse para prever a tag de uma pergunta de programação no Stack Overflow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## Análise de sentimento\n", + "\n", + "Este notebook treina um modelo de análise de sentimento para classificar avaliações de filmes como *positivas* ou *negativas*, com base no texto da avaliação. Este é um exemplo de classificação *binária*, ou de duas classes, um tipo de problema de aprendizado de máquina importante, com diversas aplicações.\n", + "\n", + "Você usará o [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/), que contém o texto de 50 mil avaliações de filmes do [Internet Movie Database](https://www.imdb.com/). Elas são divididas em 25 mil avaliações para treinamento e 25 mil para teste. Os conjuntos de treinamento e teste são *equilibrados*, ou seja, contêm a mesma quantidade de avaliações positivas e negativas.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### Baixe e explore o dataset do IMDB\n", + "\n", + "Vamos baixar e extrair o dataset, depois vamos explorar a estrutura de diretórios." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "Os diretórios `aclImdb/train/pos` e `aclImdb/train/neg` contêm diversos arquivos de texto, sendo que cada um é uma única avaliação de filme. Vamos dar uma olhada em um desses arquivos." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### Carregue o dataset\n", + "\n", + "Agora, você vai carregar os dados para fora do disco e colocá-los em um formato adequado para o treinamento. Para isso, você usará um utilitário muito útil, o [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera uma estrutura de diretórios, como mostrado abaixo.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "Para preparar um dataset para fazer classificação binária, você precisa de duas pastas no disco, correspondentes a `class_a` e `class_b`. Elas conterão avaliações positivas e negativas de filmes, que podem ser encontradas em `aclImdb/train/pos` e `aclImdb/train/neg`. Como o dataset do IMDB contém pastas adicionais, você vai removê-las antes de usar o utilitário." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "Agora, você usará o utilitário `text_dataset_from_directory` para criar um `tf.data.Dataset` com rótulos. [tf.data](https://www.tensorflow.org/guide/data) é uma coleção de ferramentas avançadas para trabalhar com dados.\n", + "\n", + "Ao realizar um experimento de aprendizado de máquina, é uma prática recomendada dividir o dataset em três: [treinamento](https://developers.google.com/machine-learning/glossary#training_set), [validação](https://developers.google.com/machine-learning/glossary#validation_set) e [teste](https://developers.google.com/machine-learning/glossary#test-set).\n", + "\n", + "O dataset do IMDB já foi dividido em conjuntos de treinamento e teste, mas ainda falta um de validação. Vamos criar um conjunto de validação utilizando uma divisão 80/20 para os dados do treinamento por meio do argumento `validation_split` abaixo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "Como podemos ver acima, há 25 mil exemplos na pasta de treinamento, das quais serão usadas 80%, ou 20 mil, para treinamento. Como veremos em breve, você pode treinar um modelo passando um dataset diretamente para `model.fit`. Se você ainda estiver aprendendo sobre `tf.data`, também pode fazer a iteração do dataset e exibir alguns exemplos, conforme mostrado abaixo." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "Observe que a avaliação contém texto bruto (com pontuações e tags HTML, como `
`). Você verá como lidar com isso na próxima seção.\n", + "\n", + "Os rótulos são 0 e 1. Para ver qual deles corresponde a avaliações positivas ou negativas de filmes, confira a propriedade `class_names` do dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "Em seguida, você criará um dataset de validação e de teste. Você usará as 5 mil avaliações restantes do conjunto de treinamento para a validação." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "Observação: ao usar os argumentos `validation_split` e `subset`, especifique uma semente aleatória ou passe `shuffle=False` para que as divisões de validação e treinamento não se sobreponham." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### Prepare o dataset para treinamento\n", + "\n", + "Em seguida, você vai padronizar, tokenizar e vetorizar os dados usando a camada `tf.keras.layers.TextVectorization`.\n", + "\n", + "Padronização refere-se ao pré-processamento do texto, tipicamente para remover pontuações ou elementos HTML a fim de simplificar o dataset. Tokenização refere-se à divisão das strings em tokens (por exemplo, dividir uma frase em palavras individuais, fazendo a divisão a cada espaço). Vetorização refere-se à conversão de tokens em números para que eles possam ser alimentados em uma rede neural. Todas essas tarefas podem ser feitas com essa camada.\n", + "\n", + "Como visto acima, as avaliações contêm diversas tags HTML, como `
`. Elas não serão removidas pelo padronizador padrão na camada `TextVectorization` (que converte texto em letras minúsculas e remove as pontuações por padrão, mas não retira código HTML). Você escreverá uma função de padronização personalizada para remover código HTML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "Observação: para evitar o [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (também conhecido como desvio de treinamento/serviço), é importante pré-processar os dados de forma idêntica no momento de treinamento e teste. Para isso, a camada `TextVectorization` pode ser incluída diretamente dentro do modelo, conforme exibido posteriormente neste tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "Em seguida, você criará uma camada `TextVectorization`, que será usada para padronizar, tokenizar e vetorizar os dados. Você deve definir `output_mode` como `int` para criar índices de inteiros únicos para cada token.\n", + "\n", + "Observe que você está utilizando a função de divisão padrão e a função de padronização personalizada definida acima. Você também definirá algumas constantes para o modelo, como um mínimo explícito `sequence_length`, que fará a camada preencher ou truncar sequências para valores exatamente iguais a `sequence_length`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "Em seguida, chame `adapt` para adequar o estado da camada de pré-processamento ao dataset. Isso fará com que o modelo crie um índice de strings para os números inteiros." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "Observação: é importante usar somente os dados de treinamento ao chamar adapt, já que o uso do dataset de teste vazaria informações." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "Vamos criar uma função para ver o resultado ao usar esta camada para pré-processar alguns dados." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "Conforme visto acima, cada token foi substituído por um inteiro. Para visualizar o token (string) ao qual cada inteiro corresponde, você pode chamar `.get_vocabulary()` na camada." 
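In addition to the `.get_vocabulary()` lookup demonstrated in the next cell, a hedged sketch like the one below (an addition, not from the notebook) maps a vectorized review back to its tokens, which makes the padding (`''`) and out-of-vocabulary (`'[UNK]'`) entries easy to spot; it assumes the `vectorize_layer`, `vectorize_text`, `first_review`, and `first_label` objects defined in the surrounding cells.

```python
# Sketch: invert the vectorization of the first review for inspection.
vocab = vectorize_layer.get_vocabulary()          # index -> token (string)
vectorized_review, _ = vectorize_text(first_review, first_label)

token_ids = vectorized_review.numpy()[0]          # drop the batch dimension
decoded = " ".join(vocab[token_id] for token_id in token_ids if token_id != 0)
print(decoded)
```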
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "Está quase tudo pronto para treinar o modelo. Como etapa final de pré-processamento, você aplicará a camada TextVectorization criada anteriormente aos datasets de treinamento, validação e teste." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### Configure o dataset para melhor desempenho\n", + "\n", + "Há dois métodos importantes que você deve usar ao carregar os dados para garantir que a I/O não seja bloqueada.\n", + "\n", + "`.cache` mantém os dados na memória após o carregamento fora do disco. Isso garante que o dataset não se torne um gargalo ao treinar seu modelo. Se o dataset for muito grande para a memória, você também pode usar esse método para criar um cache no disco eficaz, que tem uma leitura mais eficiente do que vários arquivos pequenos.\n", + "\n", + "`/prefetch` sobrepõe o pré-processamento de dados e a execução do modelo durante o treinamento.\n", + "\n", + "Saiba mais sobre ambos os métodos, além de como armazenar os dados em cache no disco, no [guia sobre desempenho dos dados](https://www.tensorflow.org/guide/data_performance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### Crie o modelo\n", + "\n", + "Chegou a hora de criar sua rede neural:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "As camadas são empilhadas sequencialmente para construir o classificador:\n", + "\n", + "1. A primeira é uma camada `Embedding`, que recebe avaliações codificadas em inteiros e avalia um vetor de embedding para cada palavra-índice. Esses vetores são aprendidos à medida que o modelo é treinado. Os vetores acrescentam uma dimensão à matriz de saída. As dimensões resultantes são: `(batch, sequence, embedding)` (lote, sequência, embedding). 
Para saber mais sobre embeddings, confira o tutorial [Embeddings de palavras](https://www.tensorflow.org/text/guide/word_embeddings).\n", + "2. A segunda camada é `GlobalAveragePooling1D`, que retorna um vetor de saída de tamanho fixo para cada exemplo, calculando a média da dimensão de sequência. Dessa forma, o modelo consegue lidar com entradas de tamanho variável da forma mais simples possível.\n", + "3. A última camada é densamente conectada com um único nó de saída." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### Função de perda e otimizador\n", + "\n", + "Todo modelo precisa de uma função de perda e um otimizador para o treinamento. Como este é um problema de classificação binária e o modelo gera como saída uma probabilidade (uma camada de unidade única com uma ativação sigmóide), você usará a função de perda `losses.BinaryCrossentropy`.\n", + "\n", + "Agora, configure o modelo para usar um otimizador e uma função de perda:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### Treine o modelo\n", + "\n", + "Você passará o objeto `dataset` ao método fit para treinar o modelo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### Avalie o modelo\n", + "\n", + "Vamos conferir o desempenho do modelo. Serão retornados dois valores: perda (um número que representa o erro; quanto menor, melhor) e exatidão." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "Essa estratégia bem simples atinge uma exatidão de cerca de 86%." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### Crie um gráfico de exatidão e perda ao longo do tempo\n", + "\n", + "`model.fit()` retorna um objeto `History` que contém um dicionário com tudo o que aconteceu durante o treinamento:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "Há quatro entradas: uma para cada métrica monitorada durante o treinamento e a validação. 
Você usará esses valores para plotar a perda do treinamento e da validação para fins comparativos, além da exatidão do treinamento e da validação:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "Neste gráfico, os pontos representam a perda e exatidão do treinamento, enquanto as linhas sólidas representam a perda e exatidão da validação.\n", + "\n", + "Observe que a perda do treinamento *diminui* a cada época, e a exatidão do treinamento *aumenta* a cada época. Isso é o esperado ao usar uma otimização do método do gradiente descendente, que deve minimizar a quantidade desejada em cada iteração.\n", + "\n", + "Esse não é o caso para a perda e exatidão de validação, que parecem atingir o pico antes da exatidão do treinamento. Este é um exemplo de overfitting: o modelo tem desempenho melhor com os dados de treinamento em comparação a dados nunca vistos antes. Após esse ponto, o modelo sofre uma sobreotimização e aprende representações *específicas* dos dados de treinamento que não oferecem boas *generalizações* para os dados de teste.\n", + "\n", + "Para este caso específico, é possível evitar o overfitting simplesmente parando o treinamento quando a exatidão da validação deixa de aumentar. Uma forma de fazer isso é usando o callback `tf.keras.callbacks.EarlyStopping`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## Exporte o modelo\n", + "\n", + "No código acima, você aplicou a camada `TextVectorization` ao dataset antes de alimentar o modelo com texto. Se quiser tornar o modelo capaz de processar strings brutas (por exemplo, para simplificar a implantação), é possível incluir a camada `TextVectorization` dentro do modelo. Para isso, você pode criar um novo modelo usando os pesos que acabou de treinar." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### Inferência de dados novos\n", + "\n", + "Para fazer previsões para novos exemplos, basta chamar `model.predict()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "Ao incluir a lógica de pré-processamento de texto dentro do modelo, você pode exportar um modelo para produção que simplifica a implantação e reduz o potencial de [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", + "\n", + "Há uma diferença de desempenho que você deve considerar ao escolher onde aplicar a camada TextVectorization. Ao usá-la fora do modelo, você pode fazer o processamento assíncrono na CPU e armazenar os dados em buffer ao treinar na GPU. Portanto, se você estiver treinando seu modelo na GPU, deve escolher essa opção para obter o melhor desempenho ao desenvolver o modelo. Depois, quando você estiver pronto para preparar a implantação, inclua a camada TextVectorization dentro do modelo.\n", + "\n", + "Confira este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para saber mais sobre como salvar modelos." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## Exercício: classificação multiclasse para perguntas do Stack Overflow\n", + "\n", + "Este tutorial mostrou como treinar um classificador binário do zero usando o dataset do IMDB. Você pode fazer um exercício: modifique este notebook para treinar um classificador multiclasse que preveja a tag de uma pergunta de programação feita no [Stack Overflow](http://stackoverflow.com/).\n", + "\n", + "Um [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) foi preparado para uso, contendo o texto de milhares de perguntas de programação (por exemplo, \"Como posso ordenar um dicionário por valor no Python?\") publicadas no Stack Overflow. Cada pergunta é rotulada com exatamente uma tag (Python, CSharp, JavaScript ou Java). 
Sua tarefa é receber uma pergunta como entrada e prever a tag apropriada, que, neste caso, é Python.\n", + "\n", + "Você usará um dataset que contém milhares de perguntas extraídas do dataset público do Stack Overflow, que é bem maior, no [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), contendo mais de 17 milhões de publicações.\n", + "\n", + "Após baixar o dataset, você verá que ele tem uma estrutura de diretórios similar ao dataset do IMDB utilizado anteriormente:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "Observação: para aumentar a dificuldade do problema de classificação, as ocorrências das palavras Python, CSharp, JavaScript e Java nas perguntas de programação foram substituídas pela palavra *blank* (em branco), já que diversas perguntas contêm a linguagem de programação em questão.\n", + "\n", + "Para fazer este exercício, você deve modificar este notebook para que funcione com o dataset do Stack Overflow das seguintes maneiras:\n", + "\n", + "1. Na parte superior do notebook, atualize o código que baixa o dataset do IMDB com o código que baixa o [dataset do Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz), que já foi preparado. Como o dataset do Stack Overflow tem uma estrutura de diretórios parecida, você não precisará fazer muitas modificações.\n", + "\n", + "2. Modifique a última camada do modelo para `Dense(4)`, pois agora há quatro classes de saída.\n", + "\n", + "3. Ao compilar o modelo, altere a perda para `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta é a função de perda correta a ser usada para problemas de classificação multiclasse, quando os rótulos de cada classe são inteiros (neste caso, podem ser 0, *1*, *2* ou *3*). Além disso, altere as métricas para `metrics=['accuracy']`, já que este é um problema de classificação multiclasse (`tf.metrics.BinaryAccuracy` é usado somente para classificadores binários).\n", + "\n", + "4. Ao plotar a precisão ao longo do tempo, altere `binary_accuracy` e `val_binary_accuracy` para `accuracy` e `val_accuracy`, respectivamente.\n", + "\n", + "5. Após fazer essas alterações, você poderá treinar um classificador multiclasse. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## Saiba mais\n", + "\n", + "Este tutorial mostrou como fazer a classificação de texto do zero. 
Para saber mais sobre o workflow de classificação de texto de forma geral, confira o [guia Classificação de texto](https://developers.google.com/machine-learning/guides/text-classification/) no Google Developers.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/zh-cn/tutorials/keras/text_classification.ipynb b/site/zh-cn/tutorials/keras/text_classification.ipynb index a9beeea6ec..4d42c2cfac 100644 --- a/site/zh-cn/tutorials/keras/text_classification.ipynb +++ b/site/zh-cn/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# 电影评论文本分类" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
在 TensorFlow.org 上查看 在 Google Colab 中运行 在 GitHub 上查看源代码 下载笔记本
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "本教程演示了从存储在磁盘上的纯文本文件开始的文本分类。您将训练一个二元分类器对 IMDB 数据集执行情感分析。在笔记本的最后,有一个练习供您尝试,您将在其中训练一个多类分类器来预测 Stack Overflow 上编程问题的标签。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## 情感分析\n", - "\n", - "此笔记本训练了一个情感分析模型,利用评论文本将电影评论分类为*正面*或*负面*评价。这是一个*二元*(或二类)分类示例,也是一个重要且应用广泛的机器学习问题。\n", - "\n", - "您将使用 [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/),其中包含 [Internet Movie Database](https://www.imdb.com/) 中的 50,000 条电影评论文本 。我们将这些评论分为两组,其中 25,000 条用于训练,另外 25,000 条用于测试。训练集和测试集是*均衡的*,也就是说其中包含相等数量的正面评价和负面评价。\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### 下载并探索 IMDB 数据集\n", - "\n", - "我们下载并提取数据集,然后浏览一下目录结构。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "`aclImdb/train/pos` 和 `aclImdb/train/neg` 目录包含许多文本文件,每个文件都是一条电影评论。我们来看看其中的一条评论。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### 加载数据集\n", - "\n", - "接下来,您将从磁盘加载数据并将其准备为适合训练的格式。为此,您将使用有用的 [text_dataset_from_directory](https://tensorflow.google.cn/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 实用工具,它期望的目录结构如下所示。\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "要准备用于二元分类的数据集,磁盘上需要有两个文件夹,分别对应于 `class_a` 和 `class_b`。这些将是正面和负面的电影评论,可以在 `aclImdb/train/pos` 和 `aclImdb/train/neg` 中找到。由于 IMDB 数据集包含其他文件夹,因此您需要在使用此实用工具之前将其移除。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = 
os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "接下来,您将使用 `text_dataset_from_directory` 实用工具创建带标签的 `tf.data.Dataset`。[tf.data](https://tensorflow.google.cn/guide/data) 是一组强大的数据处理工具。\n", - "\n", - "运行机器学习实验时,最佳做法是将数据集拆成三份:[训练](https://developers.google.com/machine-learning/glossary#training_set)、[验证](https://developers.google.com/machine-learning/glossary#validation_set) 和 [测试](https://developers.google.com/machine-learning/glossary#test-set)。\n", - "\n", - "IMDB 数据集已经分成训练集和测试集,但缺少验证集。我们来通过下面的 `validation_split` 参数,使用 80:20 拆分训练数据来创建验证集。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "如上所示,训练文件夹中有 25,000 个样本,您将使用其中的 80%(或 20,000 个)进行训练。稍后您将看到,您可以通过将数据集直接传递给 `model.fit` 来训练模型。如果您不熟悉 `tf.data`,还可以遍历数据集并打印出一些样本,如下所示。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "请注意,评论包含原始文本(带有标点符号和偶尔出现的 HTML 代码,如 `
`)。我们将在以下部分展示如何处理这些问题。\n", - "\n", - "标签为 0 或 1。要查看它们与正面和负面电影评论的对应关系,可以查看数据集上的 `class_names` 属性。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "接下来,您将创建验证数据集和测试数据集。您将使用训练集中剩余的 5,000 条评论进行验证。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "注:使用 `validation_split` 和 `subset` 参数时,请确保要么指定随机种子,要么传递 `shuffle=False`,这样验证拆分和训练拆分就不会重叠。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### 准备用于训练的数据集\n", - "\n", - "接下来,您将使用有用的 `tf.keras.layers.TextVectorization` 层对数据进行标准化、词例化和向量化。\n", - "\n", - "标准化是指对文本进行预处理,通常是移除标点符号或 HTML 元素以简化数据集。词例化是指将字符串分割成词例(例如,通过空格将句子分割成单个单词)。向量化是指将词例转换为数字,以便将它们输入神经网络。所有这些任务都可以通过这个层完成。\n", - "\n", - "正如您在上面看到的,评论包含各种 HTML 代码,例如 `
`。`TextVectorization` 层(默认情况下会将文本转换为小写并去除标点符号,但不会去除 HTML)中的默认标准化程序不会移除这些代码。您将编写一个自定义标准化函数来移除 HTML。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "注:为了防止[训练-测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(也称为训练-应用偏差),在训练和测试时间对数据进行相同的预处理非常重要。为此,可以将 `TextVectorization` 层直接包含在模型中,如本教程后面所示。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "
接下来,您将创建一个 `TextVectorization` 层。您将使用该层对我们的数据进行标准化、词例化和向量化。您将 `output_mode` 设置为 `int` 以便为每个词例创建唯一的整数索引。\n", - "\n", - "请注意,您使用的是默认拆分函数,以及您在上面定义的自定义标准化函数。您还将为模型定义一些常量,例如显式的最大 `sequence_length`,这会使层将序列填充或截断为精确的 `sequence_length` 值。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "接下来,您将调用 `adapt` 以使预处理层的状态适合数据集。这会使模型构建字符串到整数的索引。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "注:在调用时请务必仅使用您的训练数据(使用测试集会泄漏信息)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "我们来创建一个函数来查看使用该层预处理一些数据的结果。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "正如您在上面看到的,每个词例都被一个整数替换了。您可以通过在该层上调用 `.get_vocabulary()` 来查找每个整数对应的词例(字符串)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "你几乎已经准备好训练你的模型了。作为最后的预处理步骤,你将在训练、验证和测试数据集上应用之前创建的TextVectorization层。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### 配置数据集以提高性能\n", - "\n", - "以下是加载数据时应该使用的两种重要方法,以确保 I/O 不会阻塞。\n", - "\n", - "从磁盘加载后,`.cache()` 会将数据保存在内存中。这将确保数据集在训练模型时不会成为瓶颈。如果您的数据集太大而无法放入内存,也可以使用此方法创建高性能的磁盘缓存,这比许多小文件的读取效率更高。\n", - "\n", - "`prefetch()` 会在训练时将数据预处理和模型执行重叠。\n", - "\n", - "您可以在[数据性能指南](https://tensorflow.google.cn/guide/data_performance)中深入了解这两种方法,以及如何将数据缓存到磁盘。" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### 创建模型\n", - "\n", - "是时候创建您的神经网络了:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "层按顺序堆叠以构建分类器:\n", - "\n", - "1. 第一个层是 `Embedding` 层。此层采用整数编码的评论,并查找每个单词索引的嵌入向量。这些向量是通过模型训练学习到的。向量向输出数组增加了一个维度。得到的维度为:`(batch, sequence, embedding)`。要详细了解嵌入向量,请参阅[单词嵌入向量](https://tensorflow.google.cn/text/guide/word_embeddings)教程。\n", - "2. 接下来,`GlobalAveragePooling1D` 将通过对序列维度求平均值来为每个样本返回一个定长输出向量。这允许模型以尽可能最简单的方式处理变长输入。\n", - "3. 最后一层与单个输出结点密集连接。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### 损失函数与优化器\n", - "\n", - "模型训练需要一个损失函数和一个优化器。由于这是一个二元分类问题,并且模型输出概率(具有 Sigmoid 激活的单一单元层),我们将使用 `losses.BinaryCrossentropy` 损失函数。\n", - "\n", - "现在,配置模型以使用优化器和损失函数:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### 训练模型\n", - "\n", - "将 `dataset` 对象传递给 fit 方法,对模型进行训练。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### 评估模型\n", - "\n", - "我们来看一下模型的性能如何。将返回两个值。损失值(loss)(一个表示误差的数字,值越低越好)与准确率(accuracy)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "这种十分简单的方式实现了约 86% 的准确率。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### 创建准确率和损失随时间变化的图表\n", - "\n", - "`model.fit()` 会返回包含一个字典的 `History` 对象。该字典包含训练过程中产生的所有信息:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - 
"其中有四个条目:每个条目代表训练和验证过程中的一项监测指标。您可以使用这些指标来绘制用于比较的训练损失和验证损失图表,以及训练准确率和验证准确率图表:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "在该图表中,虚线代表训练损失和准确率,实线代表验证损失和准确率。\n", - "\n", - "请注意,训练损失会逐周期*下降*,而训练准确率则逐周期*上升*。使用梯度下降优化时,这是预期结果,它应该在每次迭代中最大限度减少所需的数量。\n", - "\n", - "但是,对于验证损失和准确率来说则不然——它们似乎会在训练转确率之前达到顶点。这是过拟合的一个例子:模型在训练数据上的表现要好于在之前从未见过的数据上的表现。经过这一点之后,模型会过度优化和学习*特定*于训练数据的表示,但无法*泛化*到测试数据。\n", - "\n", - "对于这种特殊情况,您可以通过在验证准确率不再增加时直接停止训练来防止过度拟合。一种方式是使用 `tf.keras.callbacks.EarlyStopping` 回调。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## 导出模型\n", - "\n", - "在上面的代码中,您在向模型馈送文本之前对数据集应用了 `TextVectorization`。 如果您想让模型能够处理原始字符串(例如,为了简化部署),您可以在模型中包含 `TextVectorization` 层。为此,您可以使用刚刚训练的权重创建一个新模型。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### 使用新数据进行推断\n", - "\n", - "要获得对新样本的预测,只需调用 `model.predict()` 即可。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "将文本预处理逻辑包含在模型中后,您可以导出用于生产的模型,从而简化部署并降低[训练/测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)的可能性。\n", - "\n", - "在选择应用 TextVectorization 层的位置时,需要注意性能差异。在模型之外使用它可以让您在 GPU 上训练时进行异步 CPU 处理和数据缓冲。因此,如果您在 GPU 上训练模型,您应该在开发模型时使用此选项以获得最佳性能,然后在准备好部署时进行切换,在模型中包含 TextVectorization 层。\n", - "\n", - "请参阅此[教程](https://tensorflow.google.cn/tutorials/keras/save_and_load),详细了解如何保存模型。" - ] - }, - { - "cell_type": "markdown", - "metadata": { 
- "id": "eSSuci_6nCEG" - }, - "source": [ - "## 练习:对 Stack Overflow 问题进行多类分类\n", - "\n", - "本教程展示了如何在 IMDB 数据集上从头开始训练二元分类器。作为练习,您可以修改此笔记本以训练多类分类器来预测 [Stack Overflow](http://stackoverflow.com/) 上的编程问题的标签。\n", - "\n", - "我们已经准备好了一个[数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)供您使用,其中包含了几千个发布在 Stack Overflow 上的编程问题(例如,\"How can sort a dictionary by value in Python?\")。每一个问题都只有一个标签(Python、CSharp、JavaScript 或 Java)。您的任务是将问题作为输入,并预测适当的标签,在本例中为 Python。\n", - "\n", - "您将使用的数据集包含从 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) 上更大的公共 Stack Overflow 数据集提取的数千个问题,其中包含超过 1700 万个帖子。\n", - "\n", - "下载数据集后,您会发现它与您之前使用的 IMDB 数据集具有相似的目录结构:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "注:为了增加分类问题的难度,编程问题中出现的 Python、CSharp、JavaScript 或 Java 等词已被替换为 *blank*(因为许多问题都包含它们所涉及的语言)。\n", - "\n", - "要完成此练习,您应该对此笔记本进行以下修改以使用 Stack Overflow 数据集:\n", - "\n", - "1. 在笔记本顶部,将下载 IMDB 数据集的代码更新为下载前面准备好的 [Stack Overflow 数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)的代码。由于 Stack Overflow 数据集具有类似的目录结构,因此您不需要进行太多修改。\n", - "\n", - "2. 将模型的最后一层修改为 `Dense(4)`,因为现在有四个输出类。\n", - "\n", - "3. 编译模型时,将损失更改为 `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`。当每个类的标签是整数(在本例中,它们可以是 0、*1*、*2* 或 *3*)时,这是用于多类分类问题的正确损失函数。 此外,将指标更改为 `metrics=['accuracy']`,因为这是一个多类分类问题(`tf.metrics.BinaryAccuracy` 仅用于二元分类器 )。\n", - "\n", - "4. 在绘制随时间变化的准确率时,请将 `binary_accuracy` 和 `val_binary_accuracy` 分别更改为 `accuracy` 和 `val_accuracy`。\n", - "\n", - "5. 完成这些更改后,就可以训练多类分类器了。 " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## 了解更多信息\n", - "\n", - "本教程从头开始介绍了文本分类。要详细了解一般的文本分类工作流程,请查看 Google Developers 提供的[文本分类指南](https://developers.google.com/machine-learning/guides/text-classification/)。\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# 电影评论文本分类" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
在 TensorFlow.org 上查看 在 Google Colab 中运行 在 GitHub 上查看源代码 下载笔记本
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "本教程演示了从存储在磁盘上的纯文本文件开始的文本分类。您将训练一个二元分类器对 IMDB 数据集执行情感分析。在笔记本的最后,有一个练习供您尝试,您将在其中训练一个多类分类器来预测 Stack Overflow 上编程问题的标签。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## 情感分析\n", + "\n", + "此笔记本训练了一个情感分析模型,利用评论文本将电影评论分类为*正面*或*负面*评价。这是一个*二元*(或二类)分类示例,也是一个重要且应用广泛的机器学习问题。\n", + "\n", + "您将使用 [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/),其中包含 [Internet Movie Database](https://www.imdb.com/) 中的 50,000 条电影评论文本 。我们将这些评论分为两组,其中 25,000 条用于训练,另外 25,000 条用于测试。训练集和测试集是*均衡的*,也就是说其中包含相等数量的正面评价和负面评价。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### 下载并探索 IMDB 数据集\n", + "\n", + "我们下载并提取数据集,然后浏览一下目录结构。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "`aclImdb/train/pos` 和 `aclImdb/train/neg` 目录包含许多文本文件,每个文件都是一条电影评论。我们来看看其中的一条评论。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### 加载数据集\n", + "\n", + "接下来,您将从磁盘加载数据并将其准备为适合训练的格式。为此,您将使用有用的 [text_dataset_from_directory](https://tensorflow.google.cn/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 实用工具,它期望的目录结构如下所示。\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "要准备用于二元分类的数据集,磁盘上需要有两个文件夹,分别对应于 `class_a` 和 `class_b`。这些将是正面和负面的电影评论,可以在 `aclImdb/train/pos` 和 `aclImdb/train/neg` 中找到。由于 IMDB 数据集包含其他文件夹,因此您需要在使用此实用工具之前将其移除。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = 
os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "接下来,您将使用 `text_dataset_from_directory` 实用工具创建带标签的 `tf.data.Dataset`。[tf.data](https://tensorflow.google.cn/guide/data) 是一组强大的数据处理工具。\n", + "\n", + "运行机器学习实验时,最佳做法是将数据集拆成三份:[训练](https://developers.google.com/machine-learning/glossary#training_set)、[验证](https://developers.google.com/machine-learning/glossary#validation_set) 和 [测试](https://developers.google.com/machine-learning/glossary#test-set)。\n", + "\n", + "IMDB 数据集已经分成训练集和测试集,但缺少验证集。我们来通过下面的 `validation_split` 参数,使用 80:20 拆分训练数据来创建验证集。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "如上所示,训练文件夹中有 25,000 个样本,您将使用其中的 80%(或 20,000 个)进行训练。稍后您将看到,您可以通过将数据集直接传递给 `model.fit` 来训练模型。如果您不熟悉 `tf.data`,还可以遍历数据集并打印出一些样本,如下所示。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "请注意,评论包含原始文本(带有标点符号和偶尔出现的 HTML 代码,如 `
`)。我们将在以下部分展示如何处理这些问题。\n", + "\n", + "标签为 0 或 1。要查看它们与正面和负面电影评论的对应关系,可以查看数据集上的 `class_names` 属性。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "接下来,您将创建验证数据集和测试数据集。您将使用训练集中剩余的 5,000 条评论进行验证。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "注:使用 `validation_split` 和 `subset` 参数时,请确保要么指定随机种子,要么传递 `shuffle=False`,这样验证拆分和训练拆分就不会重叠。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### 准备用于训练的数据集\n", + "\n", + "接下来,您将使用有用的 `tf.keras.layers.TextVectorization` 层对数据进行标准化、词例化和向量化。\n", + "\n", + "标准化是指对文本进行预处理,通常是移除标点符号或 HTML 元素以简化数据集。词例化是指将字符串分割成词例(例如,通过空格将句子分割成单个单词)。向量化是指将词例转换为数字,以便将它们输入神经网络。所有这些任务都可以通过这个层完成。\n", + "\n", + "正如您在上面看到的,评论包含各种 HTML 代码,例如 `
`。`TextVectorization` 层(默认情况下会将文本转换为小写并去除标点符号,但不会去除 HTML)中的默认标准化程序不会移除这些代码。您将编写一个自定义标准化函数来移除 HTML。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "注:为了防止[训练-测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(也称为训练-应用偏差),在训练和测试时间对数据进行相同的预处理非常重要。为此,可以将 `TextVectorization` 层直接包含在模型中,如本教程后面所示。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "
接下来,您将创建一个 `TextVectorization` 层。您将使用该层对我们的数据进行标准化、词例化和向量化。您将 `output_mode` 设置为 `int` 以便为每个词例创建唯一的整数索引。\n", + "\n", + "请注意,您使用的是默认拆分函数,以及您在上面定义的自定义标准化函数。您还将为模型定义一些常量,例如显式的最大 `sequence_length`,这会使层将序列填充或截断为精确的 `sequence_length` 值。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "接下来,您将调用 `adapt` 以使预处理层的状态适合数据集。这会使模型构建字符串到整数的索引。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "注:在调用时请务必仅使用您的训练数据(使用测试集会泄漏信息)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "我们来创建一个函数来查看使用该层预处理一些数据的结果。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "正如您在上面看到的,每个词例都被一个整数替换了。您可以通过在该层上调用 `.get_vocabulary()` 来查找每个整数对应的词例(字符串)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "你几乎已经准备好训练你的模型了。作为最后的预处理步骤,你将在训练、验证和测试数据集上应用之前创建的TextVectorization层。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### 配置数据集以提高性能\n", + "\n", + "以下是加载数据时应该使用的两种重要方法,以确保 I/O 不会阻塞。\n", + "\n", + "从磁盘加载后,`.cache()` 会将数据保存在内存中。这将确保数据集在训练模型时不会成为瓶颈。如果您的数据集太大而无法放入内存,也可以使用此方法创建高性能的磁盘缓存,这比许多小文件的读取效率更高。\n", + "\n", + "`prefetch()` 会在训练时将数据预处理和模型执行重叠。\n", + "\n", + "您可以在[数据性能指南](https://tensorflow.google.cn/guide/data_performance)中深入了解这两种方法,以及如何将数据缓存到磁盘。" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### 创建模型\n", + "\n", + "是时候创建您的神经网络了:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "层按顺序堆叠以构建分类器:\n", + "\n", + "1. 第一个层是 `Embedding` 层。此层采用整数编码的评论,并查找每个单词索引的嵌入向量。这些向量是通过模型训练学习到的。向量向输出数组增加了一个维度。得到的维度为:`(batch, sequence, embedding)`。要详细了解嵌入向量,请参阅[单词嵌入向量](https://tensorflow.google.cn/text/guide/word_embeddings)教程。\n", + "2. 接下来,`GlobalAveragePooling1D` 将通过对序列维度求平均值来为每个样本返回一个定长输出向量。这允许模型以尽可能最简单的方式处理变长输入。\n", + "3. 最后一层与单个输出结点密集连接。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### 损失函数与优化器\n", + "\n", + "模型训练需要一个损失函数和一个优化器。由于这是一个二元分类问题,并且模型输出概率(具有 Sigmoid 激活的单一单元层),我们将使用 `losses.BinaryCrossentropy` 损失函数。\n", + "\n", + "现在,配置模型以使用优化器和损失函数:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### 训练模型\n", + "\n", + "将 `dataset` 对象传递给 fit 方法,对模型进行训练。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### 评估模型\n", + "\n", + "我们来看一下模型的性能如何。将返回两个值。损失值(loss)(一个表示误差的数字,值越低越好)与准确率(accuracy)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "这种十分简单的方式实现了约 86% 的准确率。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### 创建准确率和损失随时间变化的图表\n", + "\n", + "`model.fit()` 会返回包含一个字典的 `History` 对象。该字典包含训练过程中产生的所有信息:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + 
"其中有四个条目:每个条目代表训练和验证过程中的一项监测指标。您可以使用这些指标来绘制用于比较的训练损失和验证损失图表,以及训练准确率和验证准确率图表:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "在该图表中,虚线代表训练损失和准确率,实线代表验证损失和准确率。\n", + "\n", + "请注意,训练损失会逐周期*下降*,而训练准确率则逐周期*上升*。使用梯度下降优化时,这是预期结果,它应该在每次迭代中最大限度减少所需的数量。\n", + "\n", + "但是,对于验证损失和准确率来说则不然——它们似乎会在训练转确率之前达到顶点。这是过拟合的一个例子:模型在训练数据上的表现要好于在之前从未见过的数据上的表现。经过这一点之后,模型会过度优化和学习*特定*于训练数据的表示,但无法*泛化*到测试数据。\n", + "\n", + "对于这种特殊情况,您可以通过在验证准确率不再增加时直接停止训练来防止过度拟合。一种方式是使用 `tf.keras.callbacks.EarlyStopping` 回调。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## 导出模型\n", + "\n", + "在上面的代码中,您在向模型馈送文本之前对数据集应用了 `TextVectorization`。 如果您想让模型能够处理原始字符串(例如,为了简化部署),您可以在模型中包含 `TextVectorization` 层。为此,您可以使用刚刚训练的权重创建一个新模型。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### 使用新数据进行推断\n", + "\n", + "要获得对新样本的预测,只需调用 `model.predict()` 即可。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "将文本预处理逻辑包含在模型中后,您可以导出用于生产的模型,从而简化部署并降低[训练/测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)的可能性。\n", + "\n", + "在选择应用 TextVectorization 层的位置时,需要注意性能差异。在模型之外使用它可以让您在 GPU 上训练时进行异步 CPU 处理和数据缓冲。因此,如果您在 GPU 上训练模型,您应该在开发模型时使用此选项以获得最佳性能,然后在准备好部署时进行切换,在模型中包含 TextVectorization 层。\n", + "\n", + "请参阅此[教程](https://tensorflow.google.cn/tutorials/keras/save_and_load),详细了解如何保存模型。" + ] + }, + { + "cell_type": "markdown", + "metadata": { 
+ "id": "eSSuci_6nCEG" + }, + "source": [ + "## 练习:对 Stack Overflow 问题进行多类分类\n", + "\n", + "本教程展示了如何在 IMDB 数据集上从头开始训练二元分类器。作为练习,您可以修改此笔记本以训练多类分类器来预测 [Stack Overflow](http://stackoverflow.com/) 上的编程问题的标签。\n", + "\n", + "我们已经准备好了一个[数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)供您使用,其中包含了几千个发布在 Stack Overflow 上的编程问题(例如,\"How can sort a dictionary by value in Python?\")。每一个问题都只有一个标签(Python、CSharp、JavaScript 或 Java)。您的任务是将问题作为输入,并预测适当的标签,在本例中为 Python。\n", + "\n", + "您将使用的数据集包含从 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) 上更大的公共 Stack Overflow 数据集提取的数千个问题,其中包含超过 1700 万个帖子。\n", + "\n", + "下载数据集后,您会发现它与您之前使用的 IMDB 数据集具有相似的目录结构:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "注:为了增加分类问题的难度,编程问题中出现的 Python、CSharp、JavaScript 或 Java 等词已被替换为 *blank*(因为许多问题都包含它们所涉及的语言)。\n", + "\n", + "要完成此练习,您应该对此笔记本进行以下修改以使用 Stack Overflow 数据集:\n", + "\n", + "1. 在笔记本顶部,将下载 IMDB 数据集的代码更新为下载前面准备好的 [Stack Overflow 数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)的代码。由于 Stack Overflow 数据集具有类似的目录结构,因此您不需要进行太多修改。\n", + "\n", + "2. 将模型的最后一层修改为 `Dense(4)`,因为现在有四个输出类。\n", + "\n", + "3. 编译模型时,将损失更改为 `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`。当每个类的标签是整数(在本例中,它们可以是 0、*1*、*2* 或 *3*)时,这是用于多类分类问题的正确损失函数。 此外,将指标更改为 `metrics=['accuracy']`,因为这是一个多类分类问题(`tf.metrics.BinaryAccuracy` 仅用于二元分类器 )。\n", + "\n", + "4. 在绘制随时间变化的准确率时,请将 `binary_accuracy` 和 `val_binary_accuracy` 分别更改为 `accuracy` 和 `val_accuracy`。\n", + "\n", + "5. 完成这些更改后,就可以训练多类分类器了。 " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## 了解更多信息\n", + "\n", + "本教程从头开始介绍了文本分类。要详细了解一般的文本分类工作流程,请查看 Google Developers 提供的[文本分类指南](https://developers.google.com/machine-learning/guides/text-classification/)。\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } From 50f48fe319fbaa451731e11571d0affba69c41de Mon Sep 17 00:00:00 2001 From: niushuaibing Date: Fri, 14 Feb 2025 16:10:58 +0800 Subject: [PATCH 2/3] Revert "fix path error" This reverts commit 809160bc663cebfd14b09e7ae3467d2ead43b703. --- .../tutorials/keras/text_classification.ipynb | 1960 ++++++++--------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1952 ++++++++-------- .../tutorials/keras/text_classification.ipynb | 1944 ++++++++-------- 6 files changed, 5844 insertions(+), 5844 deletions(-) diff --git a/site/en-snapshot/tutorials/keras/text_classification.ipynb b/site/en-snapshot/tutorials/keras/text_classification.ipynb index f6c1506722..4182c3f295 100644 --- a/site/en-snapshot/tutorials/keras/text_classification.ipynb +++ b/site/en-snapshot/tutorials/keras/text_classification.ipynb @@ -1,982 +1,982 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# Basic text classification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " View on TensorFlow.org\n", - " \n", - " Run in Google Colab\n", - " \n", - " View source on GitHub\n", - " \n", - " Download notebook\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "This tutorial demonstrates text classification starting from plain text files stored on disk. You'll train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try, in which you'll train a multi-class classifier to predict the tag for a programming question on Stack Overflow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## Sentiment analysis\n", - "\n", - "This notebook trains a sentiment analysis model to classify movie reviews as *positive* or *negative*, based on the text of the review. This is an example of *binary*—or two-class—classification, an important and widely applicable kind of machine learning problem.\n", - "\n", - "You'll use the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) that contains the text of 50,000 movie reviews from the [Internet Movie Database](https://www.imdb.com/). These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are *balanced*, meaning they contain an equal number of positive and negative reviews.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### Download and explore the IMDB dataset\n", - "\n", - "Let's download and extract the dataset, then explore the directory structure." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "The `aclImdb/train/pos` and `aclImdb/train/neg` directories contain many text files, each of which is a single movie review. Let's take a look at one of them." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### Load the dataset\n", - "\n", - "Next, you will load the data off disk and prepare it into a format suitable for training. To do so, you will use the helpful [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) utility, which expects a directory structure as follows.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "To prepare a dataset for binary classification, you will need two folders on disk, corresponding to `class_a` and `class_b`. These will be the positive and negative movie reviews, which can be found in `aclImdb/train/pos` and `aclImdb/train/neg`. As the IMDB dataset contains additional folders, you will remove them before using this utility." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "Next, you will use the `text_dataset_from_directory` utility to create a labeled `tf.data.Dataset`. [tf.data](https://www.tensorflow.org/guide/data) is a powerful collection of tools for working with data. \n", - "\n", - "When running a machine learning experiment, it is a best practice to divide your dataset into three splits: [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set), and [test](https://developers.google.com/machine-learning/glossary#test-set). \n", - "\n", - "The IMDB dataset has already been divided into train and test, but it lacks a validation set. Let's create a validation set using an 80:20 split of the training data by using the `validation_split` argument below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "As you can see above, there are 25,000 examples in the training folder, of which you will use 80% (or 20,000) for training. As you will see in a moment, you can train a model by passing a dataset directly to `model.fit`. If you're new to `tf.data`, you can also iterate over the dataset and print out a few examples as follows." 
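The cell that follows prints a few raw examples directly. Independently of that, the dataset's structure can also be inspected; this is a sketch that is not part of the notebook and uses only the `raw_train_ds` defined above.

```python
# Sketch only: each element is a (batch of review strings, batch of int labels);
# with 20,000 training examples and batch_size=32 this should report 625 batches.
print(raw_train_ds.element_spec)
print("batches per epoch:", raw_train_ds.cardinality().numpy())
```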
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "Notice the reviews contain raw text (with punctuation and occasional HTML tags like `
`). You will show how to handle these in the following section. \n", - "\n", - "The labels are 0 or 1. To see which of these correspond to positive and negative movie reviews, you can check the `class_names` property on the dataset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "Next, you will create a validation and test dataset. You will use the remaining 5,000 reviews from the training set for validation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "Note: When using the `validation_split` and `subset` arguments, make sure to either specify a random seed, or to pass `shuffle=False`, so that the validation and training splits have no overlap." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### Prepare the dataset for training\n", - "\n", - "Next, you will standardize, tokenize, and vectorize the data using the helpful `tf.keras.layers.TextVectorization` layer. \n", - "\n", - "Standardization refers to preprocessing the text, typically to remove punctuation or HTML elements to simplify the dataset. Tokenization refers to splitting strings into tokens (for example, splitting a sentence into individual words, by splitting on whitespace). Vectorization refers to converting tokens into numbers so they can be fed into a neural network. All of these tasks can be accomplished with this layer.\n", - "\n", - "As you saw above, the reviews contain various HTML tags like `
`. These tags will not be removed by the default standardizer in the `TextVectorization` layer (which converts text to lowercase and strips punctuation by default, but doesn't strip HTML). You will write a custom standardization function to remove the HTML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "Note: To prevent [training-testing skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (also known as training-serving skew), it is important to preprocess the data identically at train and test time. To facilitate this, the `TextVectorization` layer can be included directly inside your model, as shown later in this tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "Next, you will create a `TextVectorization` layer. You will use this layer to standardize, tokenize, and vectorize our data. You set the `output_mode` to `int` to create unique integer indices for each token.\n", - "\n", - "Note that you're using the default split function, and the custom standardization function you defined above. You'll also define some constants for the model, like an explicit maximum `sequence_length`, which will cause the layer to pad or truncate sequences to exactly `sequence_length` values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "Next, you will call `adapt` to fit the state of the preprocessing layer to the dataset. This will cause the model to build an index of strings to integers." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "Note: It's important to only use your training data when calling adapt (using the test set would leak information)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "Let's create a function to see the result of using this layer to preprocess some data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "As you can see above, each token has been replaced by an integer. You can lookup the token (string) that each integer corresponds to by calling `.get_vocabulary()` on the layer." 
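Separately from the vocabulary lookups shown next, the custom standardizer defined earlier can be spot-checked on its own. This sketch is not in the notebook; it assumes the standardizer lowercases text, replaces `<br />` tags with spaces, and strips punctuation, as the surrounding text describes, and the sample string and expected output are only illustrative.

```python
# Sketch only: run custom_standardization on a small example to verify that
# case, "<br />" tags, and punctuation are handled as described.
sample = tf.constant(["Great movie!<br /><br />Loved it."])
print(custom_standardization(sample).numpy())
# expected output along the lines of: [b'great movie  loved it']
```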
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "You are nearly ready to train your model. As a final preprocessing step, you will apply the TextVectorization layer you created earlier to the train, validation, and test dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### Configure the dataset for performance\n", - "\n", - "These are two important methods you should use when loading data to make sure that I/O does not become blocking.\n", - "\n", - "`.cache()` keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.\n", - "\n", - "`.prefetch()` overlaps data preprocessing and model execution while training. \n", - "\n", - "You can learn more about both methods, as well as how to cache data to disk in the [data performance guide](https://www.tensorflow.org/guide/data_performance)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### Create the model\n", - "\n", - "It's time to create your neural network:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "The layers are stacked sequentially to build the classifier:\n", - "\n", - "1. The first layer is an `Embedding` layer. This layer takes the integer-encoded reviews and looks up an embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: `(batch, sequence, embedding)`. To learn more about embeddings, check out the [Word embeddings](https://www.tensorflow.org/text/guide/word_embeddings) tutorial.\n", - "2. 
Next, a `GlobalAveragePooling1D` layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.\n", - "3. The last layer is densely connected with a single output node." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### Loss function and optimizer\n", - "\n", - "A model needs a loss function and an optimizer for training. Since this is a binary classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), you'll use `losses.BinaryCrossentropy` loss function.\n", - "\n", - "Now, configure the model to use an optimizer and a loss function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### Train the model\n", - "\n", - "You will train the model by passing the `dataset` object to the fit method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### Evaluate the model\n", - "\n", - "Let's see how the model performs. Two values will be returned. Loss (a number which represents our error, lower values are better), and accuracy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "This fairly naive approach achieves an accuracy of about 86%." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### Create a plot of accuracy and loss over time\n", - "\n", - "`model.fit()` returns a `History` object that contains a dictionary with everything that happened during training:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "There are four entries: one for each monitored metric during training and validation. 
You can use these to plot the training and validation loss for comparison, as well as the training and validation accuracy:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "In this plot, the dots represent the training loss and accuracy, and the solid lines are the validation loss and accuracy.\n", - "\n", - "Notice the training loss *decreases* with each epoch and the training accuracy *increases* with each epoch. This is expected when using a gradient descent optimization—it should minimize the desired quantity on every iteration.\n", - "\n", - "This isn't the case for the validation loss and accuracy—they seem to peak before the training accuracy. This is an example of overfitting: the model performs better on the training data than it does on data it has never seen before. After this point, the model over-optimizes and learns representations *specific* to the training data that do not *generalize* to test data.\n", - "\n", - "For this particular case, you could prevent overfitting by simply stopping the training when the validation accuracy is no longer increasing. One way to do so is to use the `tf.keras.callbacks.EarlyStopping` callback." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## Export the model\n", - "\n", - "In the code above, you applied the `TextVectorization` layer to the dataset before feeding text to the model. If you want to make your model capable of processing raw strings (for example, to simplify deploying it), you can include the `TextVectorization` layer inside your model. To do so, you can create a new model using the weights you just trained." 
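The `tf.keras.callbacks.EarlyStopping` callback mentioned above is not demonstrated in the notebook. Before moving on to the export step below, here is a minimal sketch of how it could be wired into `model.fit`; the `patience` value is an illustrative assumption, and the monitored metric name follows the `binary_accuracy` metric configured in the compile step above.

```python
# Sketch only: stop training once validation accuracy stops improving and
# restore the weights from the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',  # matches the BinaryAccuracy metric used above
    patience=2,                     # illustrative: wait 2 epochs before stopping
    restore_best_weights=True)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[early_stop])
```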
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### Inference on new data\n", - "\n", - "To get predictions for new examples, you can simply call `model.predict()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "Including the text preprocessing logic inside your model enables you to export a model for production that simplifies deployment, and reduces the potential for [train/test skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", - "\n", - "There is a performance difference to keep in mind when choosing where to apply your TextVectorization layer. Using it outside of your model enables you to do asynchronous CPU processing and buffering of your data when training on GPU. So, if you're training your model on the GPU, you probably want to go with this option to get the best performance while developing your model, then switch to including the TextVectorization layer inside your model when you're ready to prepare for deployment.\n", - "\n", - "Visit this [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) to learn more about saving models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## Exercise: multi-class classification on Stack Overflow questions\n", - "\n", - "This tutorial showed how to train a binary classifier from scratch on the IMDB dataset. As an exercise, you can modify this notebook to train a multi-class classifier to predict the tag of a programming question on [Stack Overflow](http://stackoverflow.com/).\n", - "\n", - "A [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) has been prepared for you to use containing the body of several thousand programming questions (for example, \"How can I sort a dictionary by value in Python?\") posted to Stack Overflow. Each of these is labeled with exactly one tag (either Python, CSharp, JavaScript, or Java). Your task is to take a question as input, and predict the appropriate tag, in this case, Python. 
\n", - "\n", - "The dataset you will work with contains several thousand questions extracted from the much larger public Stack Overflow dataset on [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), which contains more than 17 million posts.\n", - "\n", - "After downloading the dataset, you will find it has a similar directory structure to the IMDB dataset you worked with previously:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "Note: To increase the difficulty of the classification problem, occurrences of the words Python, CSharp, JavaScript, or Java in the programming questions have been replaced with the word *blank* (as many questions contain the language they're about).\n", - "\n", - "To complete this exercise, you should modify this notebook to work with the Stack Overflow dataset by making the following modifications:\n", - "\n", - "1. At the top of your notebook, update the code that downloads the IMDB dataset with code to download the [Stack Overflow dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) that has already been prepared. As the Stack Overflow dataset has a similar directory structure, you will not need to make many modifications.\n", - "\n", - "1. Modify the last layer of your model to `Dense(4)`, as there are now four output classes.\n", - "\n", - "1. When compiling the model, change the loss to `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. This is the correct loss function to use for a multi-class classification problem, when the labels for each class are integers (in this case, they can be 0, *1*, *2*, or *3*). In addition, change the metrics to `metrics=['accuracy']`, since this is a multi-class classification problem (`tf.metrics.BinaryAccuracy` is only used for binary classifiers).\n", - "\n", - "1. When plotting accuracy over time, change `binary_accuracy` and `val_binary_accuracy` to `accuracy` and `val_accuracy`, respectively.\n", - "\n", - "1. Once these changes are complete, you will be able to train a multi-class classifier. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## Learning more\n", - "\n", - "This tutorial introduced text classification from scratch. To learn more about the text classification workflow in general, check out the [Text classification guide](https://developers.google.com/machine-learning/guides/text-classification/) from Google Developers.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# Basic text classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " View on TensorFlow.org\n", + " \n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + " \n", + " Download notebook\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "This tutorial demonstrates text classification starting from plain text files stored on disk. You'll train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try, in which you'll train a multi-class classifier to predict the tag for a programming question on Stack Overflow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## Sentiment analysis\n", + "\n", + "This notebook trains a sentiment analysis model to classify movie reviews as *positive* or *negative*, based on the text of the review. This is an example of *binary*—or two-class—classification, an important and widely applicable kind of machine learning problem.\n", + "\n", + "You'll use the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) that contains the text of 50,000 movie reviews from the [Internet Movie Database](https://www.imdb.com/). These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are *balanced*, meaning they contain an equal number of positive and negative reviews.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### Download and explore the IMDB dataset\n", + "\n", + "Let's download and extract the dataset, then explore the directory structure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "The `aclImdb/train/pos` and `aclImdb/train/neg` directories contain many text files, each of which is a single movie review. Let's take a look at one of them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### Load the dataset\n", + "\n", + "Next, you will load the data off disk and prepare it into a format suitable for training. To do so, you will use the helpful [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) utility, which expects a directory structure as follows.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "To prepare a dataset for binary classification, you will need two folders on disk, corresponding to `class_a` and `class_b`. These will be the positive and negative movie reviews, which can be found in `aclImdb/train/pos` and `aclImdb/train/neg`. As the IMDB dataset contains additional folders, you will remove them before using this utility." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "Next, you will use the `text_dataset_from_directory` utility to create a labeled `tf.data.Dataset`. [tf.data](https://www.tensorflow.org/guide/data) is a powerful collection of tools for working with data. \n", + "\n", + "When running a machine learning experiment, it is a best practice to divide your dataset into three splits: [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set), and [test](https://developers.google.com/machine-learning/glossary#test-set). \n", + "\n", + "The IMDB dataset has already been divided into train and test, but it lacks a validation set. Let's create a validation set using an 80:20 split of the training data by using the `validation_split` argument below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "As you can see above, there are 25,000 examples in the training folder, of which you will use 80% (or 20,000) for training. As you will see in a moment, you can train a model by passing a dataset directly to `model.fit`. If you're new to `tf.data`, you can also iterate over the dataset and print out a few examples as follows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "Notice the reviews contain raw text (with punctuation and occasional HTML tags like `
`). You will show how to handle these in the following section. \n", + "\n", + "The labels are 0 or 1. To see which of these correspond to positive and negative movie reviews, you can check the `class_names` property on the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "Next, you will create a validation and test dataset. You will use the remaining 5,000 reviews from the training set for validation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "Note: When using the `validation_split` and `subset` arguments, make sure to either specify a random seed, or to pass `shuffle=False`, so that the validation and training splits have no overlap." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### Prepare the dataset for training\n", + "\n", + "Next, you will standardize, tokenize, and vectorize the data using the helpful `tf.keras.layers.TextVectorization` layer. \n", + "\n", + "Standardization refers to preprocessing the text, typically to remove punctuation or HTML elements to simplify the dataset. Tokenization refers to splitting strings into tokens (for example, splitting a sentence into individual words, by splitting on whitespace). Vectorization refers to converting tokens into numbers so they can be fed into a neural network. All of these tasks can be accomplished with this layer.\n", + "\n", + "As you saw above, the reviews contain various HTML tags like `
`. These tags will not be removed by the default standardizer in the `TextVectorization` layer (which converts text to lowercase and strips punctuation by default, but doesn't strip HTML). You will write a custom standardization function to remove the HTML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "Note: To prevent [training-testing skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (also known as training-serving skew), it is important to preprocess the data identically at train and test time. To facilitate this, the `TextVectorization` layer can be included directly inside your model, as shown later in this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "Next, you will create a `TextVectorization` layer. You will use this layer to standardize, tokenize, and vectorize our data. You set the `output_mode` to `int` to create unique integer indices for each token.\n", + "\n", + "Note that you're using the default split function, and the custom standardization function you defined above. You'll also define some constants for the model, like an explicit maximum `sequence_length`, which will cause the layer to pad or truncate sequences to exactly `sequence_length` values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "Next, you will call `adapt` to fit the state of the preprocessing layer to the dataset. This will cause the model to build an index of strings to integers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "Note: It's important to only use your training data when calling adapt (using the test set would leak information)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "Let's create a function to see the result of using this layer to preprocess some data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "As you can see above, each token has been replaced by an integer. You can lookup the token (string) that each integer corresponds to by calling `.get_vocabulary()` on the layer." 
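In addition to the individual index lookups in the next cell, it can be useful to know that a `TextVectorization` vocabulary reserves its first two entries. This sketch is not in the notebook, and the example tokens shown in the comment are only indicative.

```python
# Sketch only: index 0 is the padding token ('') and index 1 is the
# out-of-vocabulary token ('[UNK]'); real tokens start at index 2,
# ordered by frequency in the adapted training text.
vocab = vectorize_layer.get_vocabulary()
print(vocab[:5])  # e.g. ['', '[UNK]', 'the', 'and', 'a']
```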
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "You are nearly ready to train your model. As a final preprocessing step, you will apply the TextVectorization layer you created earlier to the train, validation, and test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### Configure the dataset for performance\n", + "\n", + "These are two important methods you should use when loading data to make sure that I/O does not become blocking.\n", + "\n", + "`.cache()` keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.\n", + "\n", + "`.prefetch()` overlaps data preprocessing and model execution while training. \n", + "\n", + "You can learn more about both methods, as well as how to cache data to disk in the [data performance guide](https://www.tensorflow.org/guide/data_performance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### Create the model\n", + "\n", + "It's time to create your neural network:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "The layers are stacked sequentially to build the classifier:\n", + "\n", + "1. The first layer is an `Embedding` layer. This layer takes the integer-encoded reviews and looks up an embedding vector for each word-index. These vectors are learned as the model trains. The vectors add a dimension to the output array. The resulting dimensions are: `(batch, sequence, embedding)`. To learn more about embeddings, check out the [Word embeddings](https://www.tensorflow.org/text/guide/word_embeddings) tutorial.\n", + "2. 
Next, a `GlobalAveragePooling1D` layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.\n", + "3. The last layer is densely connected with a single output node." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### Loss function and optimizer\n", + "\n", + "A model needs a loss function and an optimizer for training. Since this is a binary classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), you'll use `losses.BinaryCrossentropy` loss function.\n", + "\n", + "Now, configure the model to use an optimizer and a loss function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### Train the model\n", + "\n", + "You will train the model by passing the `dataset` object to the fit method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### Evaluate the model\n", + "\n", + "Let's see how the model performs. Two values will be returned. Loss (a number which represents our error, lower values are better), and accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "This fairly naive approach achieves an accuracy of about 86%." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### Create a plot of accuracy and loss over time\n", + "\n", + "`model.fit()` returns a `History` object that contains a dictionary with everything that happened during training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "There are four entries: one for each monitored metric during training and validation. 
You can use these to plot the training and validation loss for comparison, as well as the training and validation accuracy:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "In this plot, the dots represent the training loss and accuracy, and the solid lines are the validation loss and accuracy.\n", + "\n", + "Notice the training loss *decreases* with each epoch and the training accuracy *increases* with each epoch. This is expected when using a gradient descent optimization—it should minimize the desired quantity on every iteration.\n", + "\n", + "This isn't the case for the validation loss and accuracy—they seem to peak before the training accuracy. This is an example of overfitting: the model performs better on the training data than it does on data it has never seen before. After this point, the model over-optimizes and learns representations *specific* to the training data that do not *generalize* to test data.\n", + "\n", + "For this particular case, you could prevent overfitting by simply stopping the training when the validation accuracy is no longer increasing. One way to do so is to use the `tf.keras.callbacks.EarlyStopping` callback." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## Export the model\n", + "\n", + "In the code above, you applied the `TextVectorization` layer to the dataset before feeding text to the model. If you want to make your model capable of processing raw strings (for example, to simplify deploying it), you can include the `TextVectorization` layer inside your model. To do so, you can create a new model using the weights you just trained." 
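The `tf.keras.callbacks.EarlyStopping` callback mentioned above is not wired up in this notebook. A minimal sketch of how it could be passed to `model.fit` is shown here; the monitored metric, the `patience` value, and `restore_best_weights` are illustrative assumptions rather than settings from this tutorial.

```python
# Minimal sketch (assumed settings): stop training once validation loss
# stops improving, and keep the best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',          # assumed metric to watch
    patience=2,                  # assumed number of epochs to wait before stopping
    restore_best_weights=True)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[early_stopping])
```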
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### Inference on new data\n", + "\n", + "To get predictions for new examples, you can simply call `model.predict()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "Including the text preprocessing logic inside your model enables you to export a model for production that simplifies deployment, and reduces the potential for [train/test skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", + "\n", + "There is a performance difference to keep in mind when choosing where to apply your TextVectorization layer. Using it outside of your model enables you to do asynchronous CPU processing and buffering of your data when training on GPU. So, if you're training your model on the GPU, you probably want to go with this option to get the best performance while developing your model, then switch to including the TextVectorization layer inside your model when you're ready to prepare for deployment.\n", + "\n", + "Visit this [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) to learn more about saving models." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## Exercise: multi-class classification on Stack Overflow questions\n", + "\n", + "This tutorial showed how to train a binary classifier from scratch on the IMDB dataset. As an exercise, you can modify this notebook to train a multi-class classifier to predict the tag of a programming question on [Stack Overflow](http://stackoverflow.com/).\n", + "\n", + "A [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) has been prepared for you to use containing the body of several thousand programming questions (for example, \"How can I sort a dictionary by value in Python?\") posted to Stack Overflow. Each of these is labeled with exactly one tag (either Python, CSharp, JavaScript, or Java). Your task is to take a question as input, and predict the appropriate tag, in this case, Python. 
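As a hedged starting point for this exercise, the download step could mirror the IMDB download used earlier in the notebook. The file name passed to `tf.keras.utils.get_file` and the extracted directory layout below are assumptions; adjust them to whatever the archive actually unpacks to.

```python
# Sketch only: download and extract the prepared Stack Overflow dataset.
# The "stack_overflow_16k" file name and the resulting train/ layout are assumptions.
url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

dataset = tf.keras.utils.get_file("stack_overflow_16k", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.dirname(dataset)
train_dir = os.path.join(dataset_dir, 'train')
print(os.listdir(train_dir))  # expected: one folder per tag (python, javascript, csharp, java)
```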
\n", + "\n", + "The dataset you will work with contains several thousand questions extracted from the much larger public Stack Overflow dataset on [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), which contains more than 17 million posts.\n", + "\n", + "After downloading the dataset, you will find it has a similar directory structure to the IMDB dataset you worked with previously:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "Note: To increase the difficulty of the classification problem, occurrences of the words Python, CSharp, JavaScript, or Java in the programming questions have been replaced with the word *blank* (as many questions contain the language they're about).\n", + "\n", + "To complete this exercise, you should modify this notebook to work with the Stack Overflow dataset by making the following modifications:\n", + "\n", + "1. At the top of your notebook, update the code that downloads the IMDB dataset with code to download the [Stack Overflow dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) that has already been prepared. As the Stack Overflow dataset has a similar directory structure, you will not need to make many modifications.\n", + "\n", + "1. Modify the last layer of your model to `Dense(4)`, as there are now four output classes.\n", + "\n", + "1. When compiling the model, change the loss to `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. This is the correct loss function to use for a multi-class classification problem, when the labels for each class are integers (in this case, they can be 0, *1*, *2*, or *3*). In addition, change the metrics to `metrics=['accuracy']`, since this is a multi-class classification problem (`tf.metrics.BinaryAccuracy` is only used for binary classifiers).\n", + "\n", + "1. When plotting accuracy over time, change `binary_accuracy` and `val_binary_accuracy` to `accuracy` and `val_accuracy`, respectively.\n", + "\n", + "1. Once these changes are complete, you will be able to train a multi-class classifier. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## Learning more\n", + "\n", + "This tutorial introduced text classification from scratch. To learn more about the text classification workflow in general, check out the [Text classification guide](https://developers.google.com/machine-learning/guides/text-classification/) from Google Developers.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/es-419/tutorials/keras/text_classification.ipynb b/site/es-419/tutorials/keras/text_classification.ipynb index c49131f1b0..889018a71c 100644 --- a/site/es-419/tutorials/keras/text_classification.ipynb +++ b/site/es-419/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# Clasificación básica de textos" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
Ver en TensorFlow.orgEjecutar en Google Colab Ver fuente en GitHub Descargar notebook
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "En este tutorial se muestra la clasificación de textos a partir de archivos de texto plano almacenados en un disco. Entrenará un clasificador binario para que analice los sentimientos de un conjunto de datos de IMDB. Al final del bloc de notas, hay un ejercicio para que lo ponga a prueba, en el que entrenará un clasificador multiclase para predecir la etiqueta de una pregunta de programación de Stack Overflow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## Análisis de sentimientos\n", - "\n", - "En este bloc de notas se entrena un modelo de análisis de sentimiento para clasificar reseñas de películas como *positivas* o *negativas* a partir del texto de la reseña. Este es un ejemplo de clasificación *binaria* (o de dos clases), un tipo de problema de aprendizaje automático importante y ampliamente aplicable.\n", - "\n", - "Usará los [enormes conjuntos de datos de reseñas de películas](https://ai.stanford.edu/~amaas/data/sentiment/) que contienen el texto de 50 000 reseñas de películas de [Internet Movie Database](https://www.imdb.com/). Se divide en 25 000 reseñas para entrenamiento y 25 000 reseñas para prueba. Los conjuntos de entrenamiento y prueba están *equilibrados*, lo que significa que contienen la misma cantidad de reseñas positivas y negativas.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### Descargar y explorar el conjunto de datos de IMDB\n", - "\n", - "Descarguemos y extraigamos los conjuntos de datos, luego, exploremos la estructura del directorio." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "Los directorios `aclImdb/train/pos` y `aclImdb/train/neg` contienen muchos archivos de texto, donde cada uno corresponde a una reseña de película. Echemos un vistazo a uno de ellos." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### Cargar el conjunto de datos\n", - "\n", - "A continuación, cargará los datos del disco y los preparará en un formato adecuado para el entrenamiento. Para esto, usará la práctica utilidad [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera una estructura de directorios como la que se muestra a continuación.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "Para preparar el conjunto de datos para clasificación binaria, necesita dos carpetas en el disco, que correspondan con `class_a` y `class_b`. Estas serán las reseñas positivas y negativas de las películas, que se pueden encontrar en `aclImdb/train/pos` y `aclImdb/train/neg`. Dado que el conjunto de datos de IMDB contiene carpetas adicionales, deberá eliminarlas antes de usar esta utilidad." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "Luego, usará la utilidad `text_dataset_from_directory` para crear un `tf.data.Dataset` etiquetado. [tf.data](https://www.tensorflow.org/guide/data) es una potente colección de herramientas para trabajar con datos.\n", - "\n", - "A la hora de hacer un experimento de aprendizaje automático, lo mejor es dividir el conjunto de datos en tres partes: [entrenamiento](https://developers.google.com/machine-learning/glossary#training_set), [validación](https://developers.google.com/machine-learning/glossary#validation_set) y [prueba](https://developers.google.com/machine-learning/glossary#test-set).\n", - "\n", - "El conjunto de datos de IMDB ya está dividido en entrenamiento y prueba, pero no cuenta con un conjunto de validación. Creemos un conjunto de validación mediante una división 80:20 de los datos de entrenamiento con ayuda del argumento `validation_split` que se muestra a continuación." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "Como puede ver en el bloque de arriba, hay 25 000 ejemplos en la carpeta de entrenamiento, de lo que usará el 80 % (o 20 000) para entrenamiento. Como verá en un momento, puede entrenar un modelo pasando un conjunto de datos directamente a `model.fit`. Si es la primera vez que usa `tf.data`, también puede iterar el conjunto de datos e imprimir algunos ejemplos como se muestra a continuación." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "Tenga en cuenta que las reseñas contienen texto bruto (con puntuación y algunas etiquetas HTML como `
`). En la siguiente sección le mostraremos cómo debe manejar esto.\n", - "\n", - "Las etiquetas son 0 o 1. Para ver cuál corresponde a las reseñas positivas y negativas de las películas, puede consultar la propiedad `class_names` en el conjunto de datos.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "A continuación, creará un conjunto de datos de validación y prueba. Usará las 5000 reseñas restantes del conjunto de entrenamiento para ejecutar la validación." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "Nota: Cuando use los argumentos `validation_split` y `subset`, asegúrese de especificar una semilla o de pasar `shuffle=False`, para que las fracciones de validación y entrenamiento no se superpongan." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### Preparar el conjunto de datos para entrenamiento\n", - "\n", - "A continuación, usará la útil capa `tf.keras.layers.TextVectorization` para estndarizar, tokenizar y vectorizar los datos.\n", - "\n", - "El término estandarización se refiere al preprocesamiento del texto que generalmente se utiliza para eliminar la puntuación o los elementos de HTML con el objetivo de simplificar el conjunto de datos. Tokenizar en este contexto es dividir las cadenas en tokens (por ejemplo, separar una frase en palabras individuales, usando los espacios en blanco para separar). La vetorización se refiere al proceso mediante el cual los tokens se convierten en números que se pueden cargar a la red neuronal. Todas estas tareas se pueden completar con esta capa.\n", - "\n", - "Como pudo ver anteriormente, las reseñas contienen varias etiquetas HTML como `
`. El estandarizador predeterminado de la capa `TextVectorization` (que convierte texto a minúsculas y elimina la puntuación de forma predeterminada, pero no elimina los elementos de HTML) no eliminará estas etiquetas. Deberá escribir una función de estandarización personalizada para eliminar el HTML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "Nota: Para evitar el [sesgo entrenamiento-prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (también conocido como sesgo entrenamiento-servicio), es importante preprocesar los datos de forma idéntica tanto durante el entrenamiento como en la etapa de prueba. Para simplificar esto, la capa `TextVectorization` se puede incluir directamente dentro del modelo, como se muestra más adelante en este tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "Luego, creará una capa `TextVectorization`. Usará esta capa para estandarizar, tokenizar y vectorizar nuestros datos. Configurará `output_mode` en `int` para crear índices enteros únicos para cada token.\n", - "\n", - "Tenga en cuenta que está usando la función de separación predeterminada y la función de estandarización personalizada que definió anteriormente. También deberá definir algunas constantes para el modelo, como un valor máximo explícito de `sequence_length`, que hará que cada capa amortigüe o trunque las secuencias exactamente a los valores `sequence_length`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "A continuación, llamará `adapt` para que ajuste el estado de la capa de preprocesamiento al conjunto de datos. Esto hará que el modelo convierta un índice de cadenas a enteros." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "Nota: Es importante que solo use sus datos de entrenamiento para al llamar adapt (si usa el conjunto de prueba, se podría filtrar información)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "Creemos una función para ver los resultados del uso de esta capa para preprocesar algunos datos." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "Como pudo ver anteriormente, cada token ha sido reemplazo por un entero. Puede buscar el token (cadena) al que corresponde cada entero llamando `.get_vocabulary()` en la capa." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "Ya está casi listo para entrenar su modelo. Como último paso de preprocesamiento, debe aplicar la capa TextVectorization que creó anteriormente a los conjuntos de datos de entrenamiento, validación y prueba." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### Configurar el conjunto de datos para mejorar el rendimiento\n", - "\n", - "Hay dos métodos importantes que debería usar al cargar los datos para asegurarse de que la E/S no se bloquee.\n", - "\n", - "`.cache()` conserva los datos en la memoria después de que descarga del disco. Esto evitará que el conjunto de datos se transforme en un cuello de botella mientras entrena su modelo. Si su conjunto de datos es demasiado grande para caber en la memoria, también puede usar este método para crear un potente caché en disco, que se lee de forma más eficiente que muchos archivos pequeños.\n", - "\n", - "`.prefetch()` superpone el preprocesamiento de los datos y la ejecución del modelo durante el entrenamiento.\n", - "\n", - "Puede obtener más información sobre ambos métodos y sobre cómo almacenar datos en caché en disco en la [guía de rendimiento de datos](https://www.tensorflow.org/guide/data_performance)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### Crear el modelo\n", - "\n", - "Llegó la hora de que cree su red neuronal:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "Las capas se apilan secuencialmente para generar el clasificador:\n", - "\n", - "1. La primera capa es una capa `Embedding`. Esta capa toma las reseñas cifradas con números enteros y busca un vector de incorporación para cada índice de palabra. Estos vectores se aprenden a medida que se entrena el modelo. Los vectores agregan una dimensión al arreglo de salida. 
Las dimensiones resultantes son las siguientes: `(batch, sequence, embedding)`. Para obtener más información sobre las incorporaciones, consulte el tutorial [Incorporaciones de palabras](https://www.tensorflow.org/text/guide/word_embeddings).\n", - "2. A continuación, una capa `GlobalAveragePooling1D` devuelve un vector de salida de longitud fija para cada ejemplo calculando el promedio sobre la dimensión de la secuencia. Esto le permite a modelo manejar entradas de longitud variable, de la forma más sencilla posible.\n", - "3. La última capa está densamente conectada con un único nodo de salida." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### Función de pérdida y optimizador\n", - "\n", - "Un modelo necesita una función de pérdida y un optimizador para el entrenamiento. Dado que este es un problema de clasificación binaria y el modelo genera una probabilidad (una capa de una sola unidad con una activación sigmoide), usaremos la función de pérdida `losses.BinaryCrossentropy`.\n", - "\n", - "Ahora, configure el modelo para usar un optimizador y una función de pérdida:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### Entrenar el modelo\n", - "\n", - "Entrenará el modelo pasando el objeto `dataset` al método fit." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### Evaluar el modelo\n", - "\n", - "Veamos el rendimiento del modelo. Nos devolverá dos valores; la pérdida (un número que representa nuestro error, los valores bajos son mejores) y la precisión." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "Este enfoque bastante sencillo alcanza una precisión del 86 %." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### Cree un gráfico de precisión y pérdida a lo largo del tiempo\n", - "\n", - "`model.fit()` devuelve un objeto `History` que contiene un diccionario con todo lo que pasó durante el entrenamiento:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "Hay cuatro entradas: una por cada métrica que se monitoreó durante el entrenamiento y la validación. 
Puede usarlas para trazar la pérdida de entrenamiento y validación para compararlas, puede hacer lo mismo con la precisión:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "En este gráfico, los puntos representan la pérdida y la precisión del entrenamiento y las líneas continuas reflejan la pérdida y la precisión de la validación.\n", - "\n", - "Como puede ver, la pérdida del entrenamiento *se reduce* época tras época y la precisión del entrenamiento *aumenta* a medida que pasan las épocas. Esto es lo que suele pasar cuando se usa una optimización con descenso de gradiente, debe reducir al mínimo la cantidad deseada en cada iteración.\n", - "\n", - "Esto no es lo que sucede en el caso de la pérdida y la precisión de la validación, al parecer llegan a su punto máximo antes que la precisión del entrenamiento. Este es un ejemplo de sobreajuste: el modelo funciona mejor con los datos de entrenamiento que con los datos que no ha visto anteriormente. Pasado este punto, el modelo se sobreoptimiza y aprende representaciones *específicas* de los datos de entrenamiento que no se *generalizan* a los datos de prueba.\n", - "\n", - "En este caso particular, podría evitar el sobreajuste con tan solo detener el entrenamiento cuando la precisión de validación deje de aumentar. Una forma de hacerlo es con la retrollamada `tf.keras.callbacks.EarlyStopping`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## Exportar el modelo\n", - "\n", - "En el código que vimos arriba, se aplicó la capa `TextVectorization` al conjunto de datos antes de cargar texto al modelo. Si desea que su modelo sea capaz de procesar cadenas sin procesar (por ejemplo, para simplificar la implementación), puede incluir la capa `TextVectorization` en su modelo. Para ello, puede crear un nuevo modelo a partir de los pesos que acaba de entrenar." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### Inferencia en los nuevos datos\n", - "\n", - "Para obtener predicciones para ejemplos nuevos, puede sencillamente llamar `model.predict()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "Incluir la lógica de preprocesamiento de textos en su modelo le permitirá exportar un modelo para producción que simplifique la implementación y reduzca la probabilidad de que se produzca un [sesgo entre entrenamiento y prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", - "\n", - "Hay una diferencia de rendimiento que tenemos que tener en cuenta a la hora de elegir dónde aplicar la capa TextVectorization. Usarla fuera de su modelo le permite hacer un procesamiento asíncrono en CPU y almacenar en búfer los datos cuando se entrena en GPU. Por lo tanto, si está entrenando su modelo en GPU, probablemente debería elegir esta opción para obtener el mejor rendimiento mientras desarrolla su modelo, y luego cambiar para incluir la capa TextVectorization dentro de su modelo cuando esté listo para prepararse para la implementación.\n", - "\n", - "Visite este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para obtener más información sobre cómo guardar modelos." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## Ejercicio: clasificación multiclase en preguntas de Stack Overflow\n", - "\n", - "En este tutorial, le mostramos cómo entrenar un clasificador binario desde cero con los conjuntos de datos de IMDB. A modo de ejercicio práctico, puede modificar este bloc de notas para entrenar un clasificador multiclase para predecir la etiqueta de una pregunta de programación en [Stack Overflow](http://stackoverflow.com/).\n", - "\n", - "Le preparamos un [conjunto de datos](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que contiene el cuerpo de varios miles de preguntas de programación, (por ejemplo, \"¿Como puedo ordenar un diccionario por valor en Python?\") que se publicaron en Stack Overflow. Cada una de ellas se etiquetó con una sola etiqueta (que puede ser Python, CSharp, JavaScript o Java). 
Su tarea consiste en tomar una pregunta como entrada y predecir la etiqueta correspondiente, en este caso, Python.\n", - "\n", - "El conjunto de datos con el que trabajará contiene miles de preguntas que fueron extraídas del conjunto de datos público de Stack Overflow en [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), que es mucho más grande y contiene más de 17 millones de publicaciones.\n", - "\n", - "Tras descargar el conjunto de datos, verá que tiene una estructura de directorio similar al conjunto de datos de IMDB con el que trabajó anteriormente:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "Nota: Para elevar el nivel de dificultad del problema de clasificación, las apariciones de las palabras Python, CSharp, JavaScript o Java en las preguntas de programación han sido reemplazadas por las palabras *en blanco* (ya que muchas preguntas mencionan el lenguaje al que se refieren).\n", - "\n", - "Para completar este ejercicio, debería modificar este bloc de notas para trabajar con el conjunto de datos de Stack Overflow aplicando los siguientes cambios:\n", - "\n", - "1. En la parte superior del bloc de notas, actualice el código que descarga el conjunto de datos de IMDB con el código para descargar el [conjunto de datos de Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que ya ha sido preparado. Como el conjunto de datos de Stack Overflow tiene una estructura de directorio similar, no será necesario que realice muchas modificaciones.\n", - "\n", - "2. Modifique la última capa de su modelo para que sea `Dense(4)`, ya que ahora son cuatro las clases de salida.\n", - "\n", - "3. Cuando compile el modelo, cambie la pérdida a `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta es la función de pérdida adecuada para usar con un problema de clasificación multiclase, cuando las etiquetas de cada clase son números enteros (en este caso, pueden ser 0, *1*, *2* o *3*). Además, cambie las métricas a `metrics=['accuracy']`, ya que este es un problema de clasificación multiclase (`tf.metrics.BinaryAccuracy` se usa solamente para clasificadores binarios).\n", - "\n", - "4. A la hora de trazar la precisión a lo largo del tiempo, cambie `binary_accuracy` y `val_binary_accuracy` por `accuracy` y `val_accuracy`, respectivamente.\n", - "\n", - "5. Una vez que haya hecho todos estos cambios, estará listo para entrenar un clasificador multiclase. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## Más información\n", - "\n", - "En este tutorial, le presentamos la clasificación de textos desde cero. 
Para obtener más información sobre el flujo de trabajo de la clasificación de textos en términos generales, consulte la [guía Clasificación de textos](https://developers.google.com/machine-learning/guides/text-classification/) de Google Developers.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# Clasificación básica de textos" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
Ver en TensorFlow.orgEjecutar en Google Colab Ver fuente en GitHub Descargar notebook
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "En este tutorial se muestra la clasificación de textos a partir de archivos de texto plano almacenados en un disco. Entrenará un clasificador binario para que analice los sentimientos de un conjunto de datos de IMDB. Al final del bloc de notas, hay un ejercicio para que lo ponga a prueba, en el que entrenará un clasificador multiclase para predecir la etiqueta de una pregunta de programación de Stack Overflow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## Análisis de sentimientos\n", + "\n", + "En este bloc de notas se entrena un modelo de análisis de sentimiento para clasificar reseñas de películas como *positivas* o *negativas* a partir del texto de la reseña. Este es un ejemplo de clasificación *binaria* (o de dos clases), un tipo de problema de aprendizaje automático importante y ampliamente aplicable.\n", + "\n", + "Usará los [enormes conjuntos de datos de reseñas de películas](https://ai.stanford.edu/~amaas/data/sentiment/) que contienen el texto de 50 000 reseñas de películas de [Internet Movie Database](https://www.imdb.com/). Se divide en 25 000 reseñas para entrenamiento y 25 000 reseñas para prueba. Los conjuntos de entrenamiento y prueba están *equilibrados*, lo que significa que contienen la misma cantidad de reseñas positivas y negativas.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### Descargar y explorar el conjunto de datos de IMDB\n", + "\n", + "Descarguemos y extraigamos los conjuntos de datos, luego, exploremos la estructura del directorio." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "Los directorios `aclImdb/train/pos` y `aclImdb/train/neg` contienen muchos archivos de texto, donde cada uno corresponde a una reseña de película. Echemos un vistazo a uno de ellos." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### Cargar el conjunto de datos\n", + "\n", + "A continuación, cargará los datos del disco y los preparará en un formato adecuado para el entrenamiento. Para esto, usará la práctica utilidad [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera una estructura de directorios como la que se muestra a continuación.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "Para preparar el conjunto de datos para clasificación binaria, necesita dos carpetas en el disco, que correspondan con `class_a` y `class_b`. Estas serán las reseñas positivas y negativas de las películas, que se pueden encontrar en `aclImdb/train/pos` y `aclImdb/train/neg`. Dado que el conjunto de datos de IMDB contiene carpetas adicionales, deberá eliminarlas antes de usar esta utilidad." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "Luego, usará la utilidad `text_dataset_from_directory` para crear un `tf.data.Dataset` etiquetado. [tf.data](https://www.tensorflow.org/guide/data) es una potente colección de herramientas para trabajar con datos.\n", + "\n", + "A la hora de hacer un experimento de aprendizaje automático, lo mejor es dividir el conjunto de datos en tres partes: [entrenamiento](https://developers.google.com/machine-learning/glossary#training_set), [validación](https://developers.google.com/machine-learning/glossary#validation_set) y [prueba](https://developers.google.com/machine-learning/glossary#test-set).\n", + "\n", + "El conjunto de datos de IMDB ya está dividido en entrenamiento y prueba, pero no cuenta con un conjunto de validación. Creemos un conjunto de validación mediante una división 80:20 de los datos de entrenamiento con ayuda del argumento `validation_split` que se muestra a continuación." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "Como puede ver en el bloque de arriba, hay 25 000 ejemplos en la carpeta de entrenamiento, de lo que usará el 80 % (o 20 000) para entrenamiento. Como verá en un momento, puede entrenar un modelo pasando un conjunto de datos directamente a `model.fit`. Si es la primera vez que usa `tf.data`, también puede iterar el conjunto de datos e imprimir algunos ejemplos como se muestra a continuación." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "Tenga en cuenta que las reseñas contienen texto bruto (con puntuación y algunas etiquetas HTML como `
`). En la siguiente sección le mostraremos cómo debe manejar esto.\n", + "\n", + "Las etiquetas son 0 o 1. Para ver cuál corresponde a las reseñas positivas y negativas de las películas, puede consultar la propiedad `class_names` en el conjunto de datos.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "A continuación, creará un conjunto de datos de validación y prueba. Usará las 5000 reseñas restantes del conjunto de entrenamiento para ejecutar la validación." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "Nota: Cuando use los argumentos `validation_split` y `subset`, asegúrese de especificar una semilla o de pasar `shuffle=False`, para que las fracciones de validación y entrenamiento no se superpongan." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### Preparar el conjunto de datos para entrenamiento\n", + "\n", + "A continuación, usará la útil capa `tf.keras.layers.TextVectorization` para estndarizar, tokenizar y vectorizar los datos.\n", + "\n", + "El término estandarización se refiere al preprocesamiento del texto que generalmente se utiliza para eliminar la puntuación o los elementos de HTML con el objetivo de simplificar el conjunto de datos. Tokenizar en este contexto es dividir las cadenas en tokens (por ejemplo, separar una frase en palabras individuales, usando los espacios en blanco para separar). La vetorización se refiere al proceso mediante el cual los tokens se convierten en números que se pueden cargar a la red neuronal. Todas estas tareas se pueden completar con esta capa.\n", + "\n", + "Como pudo ver anteriormente, las reseñas contienen varias etiquetas HTML como `
`. El estandarizador predeterminado de la capa `TextVectorization` (que convierte texto a minúsculas y elimina la puntuación de forma predeterminada, pero no elimina los elementos de HTML) no eliminará estas etiquetas. Deberá escribir una función de estandarización personalizada para eliminar el HTML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "Nota: Para evitar el [sesgo entrenamiento-prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (también conocido como sesgo entrenamiento-servicio), es importante preprocesar los datos de forma idéntica tanto durante el entrenamiento como en la etapa de prueba. Para simplificar esto, la capa `TextVectorization` se puede incluir directamente dentro del modelo, como se muestra más adelante en este tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "Luego, creará una capa `TextVectorization`. Usará esta capa para estandarizar, tokenizar y vectorizar nuestros datos. Configurará `output_mode` en `int` para crear índices enteros únicos para cada token.\n", + "\n", + "Tenga en cuenta que está usando la función de separación predeterminada y la función de estandarización personalizada que definió anteriormente. También deberá definir algunas constantes para el modelo, como un valor máximo explícito de `sequence_length`, que hará que cada capa amortigüe o trunque las secuencias exactamente a los valores `sequence_length`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "A continuación, llamará `adapt` para que ajuste el estado de la capa de preprocesamiento al conjunto de datos. Esto hará que el modelo convierta un índice de cadenas a enteros." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "Nota: Es importante que solo use sus datos de entrenamiento para al llamar adapt (si usa el conjunto de prueba, se podría filtrar información)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "Creemos una función para ver los resultados del uso de esta capa para preprocesar algunos datos." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "Como pudo ver anteriormente, cada token ha sido reemplazo por un entero. Puede buscar el token (cadena) al que corresponde cada entero llamando `.get_vocabulary()` en la capa." 
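The paragraph above points to `.get_vocabulary()` as the way to map each integer index back to its token, and the next cell prints two individual lookups. As a small illustration only (this helper is not part of the original notebook; it assumes the `vectorize_layer`, `vectorize_text`, `first_review` and `first_label` objects created in the cells above), the following sketch reconstructs an approximate plain-text version of a vectorized review:

```python
# Hypothetical helper for illustration: decode a vectorized review back to
# (approximate) text using the layer's vocabulary. Index 0 is the padding
# token (''), so it is skipped.
vocab = vectorize_layer.get_vocabulary()

def decode_review(vectorized_review):
  return " ".join(vocab[i] for i in vectorized_review.numpy().flatten() if i != 0)

vectorized_review, _ = vectorize_text(first_review, first_label)
print(decode_review(vectorized_review))
```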
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "Ya está casi listo para entrenar su modelo. Como último paso de preprocesamiento, debe aplicar la capa TextVectorization que creó anteriormente a los conjuntos de datos de entrenamiento, validación y prueba." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### Configurar el conjunto de datos para mejorar el rendimiento\n", + "\n", + "Hay dos métodos importantes que debería usar al cargar los datos para asegurarse de que la E/S no se bloquee.\n", + "\n", + "`.cache()` conserva los datos en la memoria después de que descarga del disco. Esto evitará que el conjunto de datos se transforme en un cuello de botella mientras entrena su modelo. Si su conjunto de datos es demasiado grande para caber en la memoria, también puede usar este método para crear un potente caché en disco, que se lee de forma más eficiente que muchos archivos pequeños.\n", + "\n", + "`.prefetch()` superpone el preprocesamiento de los datos y la ejecución del modelo durante el entrenamiento.\n", + "\n", + "Puede obtener más información sobre ambos métodos y sobre cómo almacenar datos en caché en disco en la [guía de rendimiento de datos](https://www.tensorflow.org/guide/data_performance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### Crear el modelo\n", + "\n", + "Llegó la hora de que cree su red neuronal:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "Las capas se apilan secuencialmente para generar el clasificador:\n", + "\n", + "1. La primera capa es una capa `Embedding`. Esta capa toma las reseñas cifradas con números enteros y busca un vector de incorporación para cada índice de palabra. Estos vectores se aprenden a medida que se entrena el modelo. Los vectores agregan una dimensión al arreglo de salida. 
Las dimensiones resultantes son las siguientes: `(batch, sequence, embedding)`. Para obtener más información sobre las incorporaciones, consulte el tutorial [Incorporaciones de palabras](https://www.tensorflow.org/text/guide/word_embeddings).\n", + "2. A continuación, una capa `GlobalAveragePooling1D` devuelve un vector de salida de longitud fija para cada ejemplo calculando el promedio sobre la dimensión de la secuencia. Esto le permite a modelo manejar entradas de longitud variable, de la forma más sencilla posible.\n", + "3. La última capa está densamente conectada con un único nodo de salida." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### Función de pérdida y optimizador\n", + "\n", + "Un modelo necesita una función de pérdida y un optimizador para el entrenamiento. Dado que este es un problema de clasificación binaria y el modelo genera una probabilidad (una capa de una sola unidad con una activación sigmoide), usaremos la función de pérdida `losses.BinaryCrossentropy`.\n", + "\n", + "Ahora, configure el modelo para usar un optimizador y una función de pérdida:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### Entrenar el modelo\n", + "\n", + "Entrenará el modelo pasando el objeto `dataset` al método fit." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### Evaluar el modelo\n", + "\n", + "Veamos el rendimiento del modelo. Nos devolverá dos valores; la pérdida (un número que representa nuestro error, los valores bajos son mejores) y la precisión." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "Este enfoque bastante sencillo alcanza una precisión del 86 %." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### Cree un gráfico de precisión y pérdida a lo largo del tiempo\n", + "\n", + "`model.fit()` devuelve un objeto `History` que contiene un diccionario con todo lo que pasó durante el entrenamiento:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "Hay cuatro entradas: una por cada métrica que se monitoreó durante el entrenamiento y la validación. 
Puede usarlas para trazar la pérdida de entrenamiento y validación para compararlas, puede hacer lo mismo con la precisión:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "En este gráfico, los puntos representan la pérdida y la precisión del entrenamiento y las líneas continuas reflejan la pérdida y la precisión de la validación.\n", + "\n", + "Como puede ver, la pérdida del entrenamiento *se reduce* época tras época y la precisión del entrenamiento *aumenta* a medida que pasan las épocas. Esto es lo que suele pasar cuando se usa una optimización con descenso de gradiente, debe reducir al mínimo la cantidad deseada en cada iteración.\n", + "\n", + "Esto no es lo que sucede en el caso de la pérdida y la precisión de la validación, al parecer llegan a su punto máximo antes que la precisión del entrenamiento. Este es un ejemplo de sobreajuste: el modelo funciona mejor con los datos de entrenamiento que con los datos que no ha visto anteriormente. Pasado este punto, el modelo se sobreoptimiza y aprende representaciones *específicas* de los datos de entrenamiento que no se *generalizan* a los datos de prueba.\n", + "\n", + "En este caso particular, podría evitar el sobreajuste con tan solo detener el entrenamiento cuando la precisión de validación deje de aumentar. Una forma de hacerlo es con la retrollamada `tf.keras.callbacks.EarlyStopping`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## Exportar el modelo\n", + "\n", + "En el código que vimos arriba, se aplicó la capa `TextVectorization` al conjunto de datos antes de cargar texto al modelo. Si desea que su modelo sea capaz de procesar cadenas sin procesar (por ejemplo, para simplificar la implementación), puede incluir la capa `TextVectorization` en su modelo. Para ello, puede crear un nuevo modelo a partir de los pesos que acaba de entrenar." 
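The overfitting discussion above mentions the `tf.keras.callbacks.EarlyStopping` callback but does not show it in use. A minimal sketch of what that could look like, assuming the `model`, `train_ds` and `val_ds` objects built earlier in this notebook; the monitored quantity and the `patience` value are illustrative choices, not prescribed by the tutorial:

```python
# Illustrative only: stop training once the validation loss stops improving.
# Assumes `model`, `train_ds` and `val_ds` from the cells above.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',          # quantity to watch
    patience=2,                  # epochs with no improvement before stopping
    restore_best_weights=True)   # roll back to the best epoch's weights

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[early_stopping])
```

With `restore_best_weights=True`, the weights from the best-scoring epoch are kept even if training runs a few epochs past the peak.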
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### Inferencia en los nuevos datos\n", + "\n", + "Para obtener predicciones para ejemplos nuevos, puede sencillamente llamar `model.predict()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "Incluir la lógica de preprocesamiento de textos en su modelo le permitirá exportar un modelo para producción que simplifique la implementación y reduzca la probabilidad de que se produzca un [sesgo entre entrenamiento y prueba](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", + "\n", + "Hay una diferencia de rendimiento que tenemos que tener en cuenta a la hora de elegir dónde aplicar la capa TextVectorization. Usarla fuera de su modelo le permite hacer un procesamiento asíncrono en CPU y almacenar en búfer los datos cuando se entrena en GPU. Por lo tanto, si está entrenando su modelo en GPU, probablemente debería elegir esta opción para obtener el mejor rendimiento mientras desarrolla su modelo, y luego cambiar para incluir la capa TextVectorization dentro de su modelo cuando esté listo para prepararse para la implementación.\n", + "\n", + "Visite este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para obtener más información sobre cómo guardar modelos." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## Ejercicio: clasificación multiclase en preguntas de Stack Overflow\n", + "\n", + "En este tutorial, le mostramos cómo entrenar un clasificador binario desde cero con los conjuntos de datos de IMDB. A modo de ejercicio práctico, puede modificar este bloc de notas para entrenar un clasificador multiclase para predecir la etiqueta de una pregunta de programación en [Stack Overflow](http://stackoverflow.com/).\n", + "\n", + "Le preparamos un [conjunto de datos](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que contiene el cuerpo de varios miles de preguntas de programación, (por ejemplo, \"¿Como puedo ordenar un diccionario por valor en Python?\") que se publicaron en Stack Overflow. Cada una de ellas se etiquetó con una sola etiqueta (que puede ser Python, CSharp, JavaScript o Java). 
Su tarea consiste en tomar una pregunta como entrada y predecir la etiqueta correspondiente, en este caso, Python.\n", + "\n", + "El conjunto de datos con el que trabajará contiene miles de preguntas que fueron extraídas del conjunto de datos público de Stack Overflow en [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), que es mucho más grande y contiene más de 17 millones de publicaciones.\n", + "\n", + "Tras descargar el conjunto de datos, verá que tiene una estructura de directorio similar al conjunto de datos de IMDB con el que trabajó anteriormente:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "Nota: Para elevar el nivel de dificultad del problema de clasificación, las apariciones de las palabras Python, CSharp, JavaScript o Java en las preguntas de programación han sido reemplazadas por las palabras *en blanco* (ya que muchas preguntas mencionan el lenguaje al que se refieren).\n", + "\n", + "Para completar este ejercicio, debería modificar este bloc de notas para trabajar con el conjunto de datos de Stack Overflow aplicando los siguientes cambios:\n", + "\n", + "1. En la parte superior del bloc de notas, actualice el código que descarga el conjunto de datos de IMDB con el código para descargar el [conjunto de datos de Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) que ya ha sido preparado. Como el conjunto de datos de Stack Overflow tiene una estructura de directorio similar, no será necesario que realice muchas modificaciones.\n", + "\n", + "2. Modifique la última capa de su modelo para que sea `Dense(4)`, ya que ahora son cuatro las clases de salida.\n", + "\n", + "3. Cuando compile el modelo, cambie la pérdida a `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta es la función de pérdida adecuada para usar con un problema de clasificación multiclase, cuando las etiquetas de cada clase son números enteros (en este caso, pueden ser 0, *1*, *2* o *3*). Además, cambie las métricas a `metrics=['accuracy']`, ya que este es un problema de clasificación multiclase (`tf.metrics.BinaryAccuracy` se usa solamente para clasificadores binarios).\n", + "\n", + "4. A la hora de trazar la precisión a lo largo del tiempo, cambie `binary_accuracy` y `val_binary_accuracy` por `accuracy` y `val_accuracy`, respectivamente.\n", + "\n", + "5. Una vez que haya hecho todos estos cambios, estará listo para entrenar un clasificador multiclase. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## Más información\n", + "\n", + "En este tutorial, le presentamos la clasificación de textos desde cero. 
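The exercise above lists five concrete modifications but leaves writing them to the reader. As a rough sketch only (not part of the tutorial), assuming `train_ds` and `val_ds` have been rebuilt from the Stack Overflow dataset exactly as the IMDB ones were, and reusing the notebook's `max_features` and `embedding_dim` constants, the changed model definition and compile step could look like this:

```python
# Illustrative sketch of the exercise's suggested changes, not the tutorial's code.
# Assumes train_ds / val_ds were rebuilt from the Stack Overflow dataset.
multiclass_model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(4)])  # four output classes: python, csharp, javascript, java

multiclass_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])  # plain accuracy instead of BinaryAccuracy

history = multiclass_model.fit(train_ds, validation_data=val_ds, epochs=10)
```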
Para obtener más información sobre el flujo de trabajo de la clasificación de textos en términos generales, consulte la [guía Clasificación de textos](https://developers.google.com/machine-learning/guides/text-classification/) de Google Developers.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/ja/tutorials/keras/text_classification.ipynb b/site/ja/tutorials/keras/text_classification.ipynb index 776ad320be..4448bf8d3d 100644 --- a/site/ja/tutorials/keras/text_classification.ipynb +++ b/site/ja/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# 映画レビューのテキスト分類" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
TensorFlow.org で表示 Google Colab で実行 GitHub でソースを表示 ノートブックをダウンロード
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "このチュートリアルでは、ディスクに保存されているプレーンテキストファイルを使用してテキストを分類する方法について説明します。IMDB データセットでセンチメント分析を実行するように、二項分類器をトレーニングします。ノートブックの最後には、Stack Overflow のプログラミングに関する質問のタグを予測するためのマルチクラス分類器をトレーニングする演習があります。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## センチメント分析\n", - "\n", - "このノートブックでは、映画レビューのテキストを使用して、それが*肯定的*であるか*否定的*であるかに分類するようにセンチメント分析モデルをトレーニングします。これは*二項*分類の例で、機械学習問題では重要な分類法として広く適用されます。\n", - "\n", - "ここでは、[Internet Movie Database](https://ai.stanford.edu/~amaas/data/sentiment/) から抽出した 50,000 件の映画レビューを含む、[大規模なレビューデータセット](https://www.imdb.com/)を使います。レビューはトレーニング用とテスト用に 25,000 件ずつに分割されています。トレーニング用とテスト用のデータは均衡しています。言い換えると、それぞれが同数の肯定的及び否定的なレビューを含んでいます。\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### IMDB データセットをダウンロードして調べる\n", - "\n", - "データセットをダウンロードして抽出してから、ディレクトリ構造を調べてみましょう。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "`aclImdb/train/pos` および `aclImdb/train/neg` ディレクトリには多くのテキストファイルが含まれており、それぞれが 1 つの映画レビューです。それらの 1 つを見てみましょう。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### データセットを読み込む\n", - "\n", - "次に、データをディスクから読み込み、トレーニングに適した形式に準備します。これを行うには、便利な [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) ユーティリティを使用します。このユーティリティは、次のようなディレクトリ構造を想定しています。\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "二項分類用のデータセットを準備するには、ディスクに `class_a` および `class_b`に対応する 2 
つのフォルダが必要です。これらは、`aclImdb/train/pos` および `aclImdb/train/neg` にある肯定的および否定的な映画レビューになります。IMDB データセットには追加のフォルダーが含まれているため、このユーティリティを使用する前にそれらを削除します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "次に、`text_dataset_from_directory` ユーティリティを使用して、ラベル付きの `tf.data.Dataset` を作成します。[tf.data](https://www.tensorflow.org/guide/data) は、データを操作するための強力なツールのコレクションです。\n", - "\n", - "機械学習実験を実行するときは、データセットを[トレーニング](https://developers.google.com/machine-learning/glossary#training_set)、[検証](https://developers.google.com/machine-learning/glossary#validation_set)、および、[テスト](https://developers.google.com/machine-learning/glossary#test-set)の 3 つに分割することをお勧めします。\n", - "\n", - "IMDB データセットはすでにトレーニング用とテスト用に分割されていますが、検証セットはありません。以下の `validation_split` 引数を使用して、トレーニングデータの 80:20 分割を使用して検証セットを作成しましょう。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "上記のように、トレーニングフォルダには 25,000 の例があり、そのうち 80% (20,000) をトレーニングに使用します。以下に示すとおり、データセットを `model.fit` に直接渡すことで、モデルをトレーニングできます。`tf.data` を初めて使用する場合は、データセットを繰り返し処理して、次のようにいくつかの例を出力することもできます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "レビューには生のテキストが含まれていることに注意してください(句読点や `
` などのような HTML タグが付いていることもあります)。次のセクションでは、これらの処理方法を示します。\n", - "\n", - "ラベルは 0 または 1 です。これらのどれが肯定的および否定的な映画レビューに対応するかを確認するには、データセットの `class_names` プロパティを確認できます。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "次に、検証およびテスト用データセットを作成します。トレーニング用セットの残りの 5,000 件のレビューを検証に使用します。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "注意: `validation_split` および `subset` 引数を使用する場合は、必ずランダムシードを指定するか、`shuffle=False` を渡して、検証とトレーニング分割に重複がないようにします。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### トレーニング用データセットを準備する\n", - "\n", - "次に、便利な `tf.keras.layers.TextVectorization` レイヤーを使用して、データを標準化、トークン化、およびベクトル化します。\n", - "\n", - "標準化とは、テキストを前処理することを指します。通常、句読点や HTML 要素を削除して、データセットを簡素化します。トークン化とは、文字列をトークンに分割することです (たとえば、空白で分割することにより、文を個々の単語に分割します)。ベクトル化とは、トークンを数値に変換して、ニューラルネットワークに入力できるようにすることです。これらのタスクはすべて、このレイヤーで実行できます。\n", - "\n", - "前述のとおり、レビューには `
` のようなさまざまな HTML タグが含まれています。これらのタグは、`TextVectorization` レイヤーのデフォルトの標準化機能によって削除されません (テキストを小文字に変換し、デフォルトで句読点を削除しますが、HTML は削除されません)。HTML を削除するカスタム標準化関数を作成します。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "注意: [トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(トレーニング/サービングスキューとも呼ばれます)を防ぐには、トレーニング時とテスト時にデータを同じように前処理することが重要です。これを容易にするためには、このチュートリアルの後半で示すように、`TextVectorization` レイヤーをモデル内に直接含めます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "次に、`TextVectorization` レイヤーを作成します。このレイヤーを使用して、データを標準化、トークン化、およびベクトル化します。`output_mode` を `int` に設定して、トークンごとに一意の整数インデックスを作成します。\n", - "\n", - "デフォルトの分割関数と、上記で定義したカスタム標準化関数を使用していることに注意してください。また、明示的な最大値 `sequence_length` など、モデルの定数をいくつか定義します。これにより、レイヤーはシーケンスを正確に `sequence_length` 値にパディングまたは切り捨てます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "次に、`adapt` を呼び出して、前処理レイヤーの状態をデータセットに適合させます。これにより、モデルは文字列から整数へのインデックスを作成します。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "注意: Adapt を呼び出すときは、トレーニング用データのみを使用することが重要です(テスト用セットを使用すると情報が漏洩します)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "このレイヤーを使用して一部のデータを前処理した結果を確認する関数を作成します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "上記のように、各トークンは整数に置き換えられています。レイヤーで `.get_vocabulary()` を呼び出すことにより、各整数が対応するトークン(文字列)を検索できます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "モデルをトレーニングする準備がほぼ整いました。最後の前処理ステップとして、トレーニング、検証、およびデータセットのテストのために前に作成した TextVectorization レイヤーを適用します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### データセットを構成してパフォーマンスを改善する\n", - "\n", - "以下は、I/O がブロックされないようにするためにデータを読み込むときに使用する必要がある 2 つの重要な方法です。\n", - "\n", - "`.cache()` はデータをディスクから読み込んだ後、データをメモリに保持します。これにより、モデルのトレーニング中にデータセットがボトルネックになることを回避できます。データセットが大きすぎてメモリに収まらない場合は、この方法を使用して、パフォーマンスの高いオンディスクキャッシュを作成することもできます。これは、多くの小さなファイルを読み込むより効率的です。\n", - "\n", - "`.prefetch()` はトレーニング中にデータの前処理とモデルの実行をオーバーラップさせます。\n", - "\n", - "以上の 2 つの方法とデータをディスクにキャッシュする方法についての詳細は、[データパフォーマンスガイド](https://www.tensorflow.org/guide/data_performance)を参照してください。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### モデルを作成する\n", - "\n", - "ニューラルネットワークを作成します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "これらのレイヤーは、分類器を構成するため一列に積み重ねられます。\n", - "\n", - "1. 最初のレイヤーは `Embedding` (埋め込み)レイヤーです。このレイヤーは、整数にエンコードされた語彙を受け取り、それぞれの単語インデックスに対応する埋め込みベクトルを検索します。埋め込みベクトルは、モデルのトレーニングの中で学習されます。ベクトル化のために、出力行列には次元が1つ追加されます。その結果、次元は、`(batch, sequence, embedding)` となります。埋め込みの詳細については、[単語埋め込みチュートリアル](https://www.tensorflow.org/text/guide/word_embeddings)を参照してください。\n", - "2. 次は、`GlobalAveragePooling1D`(1次元のグローバル平均プーリング)レイヤーです。このレイヤーは、それぞれのサンプルについて、シーケンスの次元方向に平均値をもとめ、固定長のベクトルを返します。この結果、モデルは最も単純な形で、可変長の入力を扱うことができるようになります。\n", - "3. 
最後のレイヤーは、単一の出力ノードと密に接続されています。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### 損失関数とオプティマイザ\n", - "\n", - "モデルをトレーニングするには、損失関数とオプティマイザが必要です。これは二項分類問題であり、モデルは確率(シグモイドアクティベーションを持つ単一ユニットレイヤー)を出力するため、`losses.BinaryCrossentropy` 損失関数を使用します。\n", - "\n", - "損失関数の候補はこれだけではありません。例えば、`mean_squared_error`(平均二乗誤差)を使うこともできます。しかし、一般的には、確率を扱うには`binary_crossentropy`の方が適しています。`binary_crossentropy`は、確率分布の間の「距離」を測定する尺度です。今回の場合には、真の分布と予測値の分布の間の距離ということになります。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### モデルをトレーニングする\n", - "\n", - "`dataset` オブジェクトを fit メソッドに渡すことにより、モデルをトレーニングします。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### モデルを評価する\n", - "\n", - "モデルがどのように実行するか見てみましょう。2 つの値が返されます。損失(誤差、値が低いほど良)と正確度です。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "この、かなり素朴なアプローチでも 86% 前後の正解度を達成しました。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### 経時的な正解度と損失のグラフを作成する\n", - "\n", - "`model.fit()` は、トレーニング中に発生したすべての情報を詰まったディクショナリを含む `History` オブジェクトを返します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "トレーニングと検証中に監視されている各メトリックに対して 1 つずつ、計 4 つのエントリがあります。このエントリを使用して、トレーニングと検証の損失とトレーニングと検証の正解度を比較したグラフを作成することができます。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower 
right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "このグラフでは、点はトレーニングの損失と正解度を表し、実線は検証の損失と正解度を表します。\n", - "\n", - "トレーニングの損失がエポックごとに*下降*し、トレーニングの正解度がエポックごとに*上昇*していることに注目してください。これは、勾配下降最適化を使用しているときに見られる現象で、イテレーションごとに希望する量を最小化します。\n", - "\n", - "これは検証の損失と精度には当てはまりません。これらはトレーニング精度の前にピークに達しているようです。これが過適合の例で、モデルが、遭遇したことのないデータよりもトレーニングデータで優れたパフォーマンスを発揮する現象です。この後、モデルは過度に最適化し、テストデータに*一般化*しないトレーニングデータ*特有*の表現を学習します。\n", - "\n", - "この特定のケースでは、検証の正解度が向上しなくなったときにトレーニングを停止することにより、過適合を防ぐことができます。これを行うには、`tf.keras.callbacks.EarlyStopping` コールバックを使用することができます。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## モデルをエクスポートする\n", - "\n", - "上記のコードでは、モデルにテキストをフィードする前に、`TextVectorization` レイヤーをデータセットに適用しました。モデルで生の文字列を処理できるようにする場合 (たとえば、展開を簡素化するため)、モデル内に `TextVectorization` レイヤーを含めることができます。これを行うには、トレーニングしたばかりの重みを使用して新しいモデルを作成します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### 新しいデータの推論\n", - "\n", - "新しい例の予測を取得するには、`model.predict()`を呼び出します。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "モデル内にテキスト前処理ロジックを含めると、モデルを本番環境にエクスポートして展開を簡素化し、[トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)の可能性を減らすことができます。\n", - "\n", - "TextVectorization レイヤーを適用する場所を選択する際に性能の違いに留意する必要があります。モデルの外部で使用すると、GPU でトレーニングするときに非同期 CPU 処理とデータのバッファリングを行うことができます。したがって、GPU でモデルをトレーニングしている場合は、モデルの開発中に最高のパフォーマンスを得るためにこのオプションを使用し、デプロイの準備ができたらモデル内に TextVectorization レイヤーを含めるように切り替えることをお勧めします。\n", - "\n", - "モデルの保存の詳細については、この[チュートリアル](https://www.tensorflow.org/tutorials/keras/save_and_load)にアクセスしてください。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## 演習:StackOverflow の質問に対するマルチクラス分類\n", - "\n", - "このチュートリアルでは、IMDB データセットで二項分類器を最初からトレーニングする方法を示しました。演習として、このノートブックを変更して、[Stack Overflow](http://stackoverflow.com/) のプログラミング質問のタグを予測するマルチクラス分類器をトレーニングできます。\n", - "\n", - "Stack Overflow に投稿された数千のプログラミングに関する質問(たとえば、「Python でディクショナリを値で並べ替える方法」)の本文を含む[データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)が用意されています。それぞれ、1 つのタグ(Python、CSharp、JavaScript、または Java のいずれか)でラベル付けされています。この演習では、質問を入力として受け取り、適切なタグ(この場合は Python)を予測します。\n", - "\n", - "使用するデータセットには、1,700 万件以上の投稿を含む [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) の大規模な StackOverflow パブリックデータセットから抽出された数千の質問が含まれています。\n", - "\n", - "データセットをダウンロードすると、以前に使用した IMDB データセットと同様のディレクトリ構造になっていることがわかります。\n", - "\n", - 
"```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "注意: 分類問題の難易度を上げるために、プログラミングの質問での Python、CSharp、JavaScript、または Java という単語は、*blank* という単語に置き換えられました(多くの質問には、対象の言語が含まれているため)。\n", - "\n", - "この演習を完了するには、、このノートブックを変更してStackOverflow データセットを操作する必要があります。次の変更を行います。\n", - "\n", - "1. ノートブックの上部で、IMDB データセットをダウンロードするコードを、事前に準備されている [Stack Overflow データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)をダウンロードするコードで更新します。Stack Overflow データセットは同様のディレクトリ構造を持っているため、多くの変更を加える必要はありません。\n", - "\n", - "2. 4 つの出力クラスがあるため、モデルの最後のレイヤーを `Dense(4)` に変更します。\n", - "\n", - "3. モデルをコンパイルするときは、損失を `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)` に変更します。これは、各クラスのラベルが整数である場合に、マルチクラス分類問題に使用する正しい損失関数です。(この場合、 0、*1*、*2*、または 3 のいずれかになります)。さらに、これはマルチクラス分類の問題であるため、メトリックを `metrics=['accuracy']` に変更します (tf.metrics.BinaryAccuracy はバイナリ分類器にのみ使用されます)。\n", - "\n", - "4. 経時的な精度をプロットする場合は、`binary_accuracy` および `val_binary_accuracy`をそれぞれ `accuracy` および `val_accuracy` に変更します。\n", - "\n", - "5. これらの変更が完了すると、マルチクラス分類器をトレーニングできるようになります。 " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## 詳細\n", - "\n", - "このチュートリアルでは、最初からテキスト分類を実行する方法を紹介しました。一般的なテキスト分類ワークフローの詳細については、Google Developers の[テキスト分類ガイド](https://developers.google.com/machine-learning/guides/text-classification/)をご覧ください。\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# 映画レビューのテキスト分類" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
TensorFlow.org で表示 Google Colab で実行 GitHub でソースを表示 ノートブックをダウンロード
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "このチュートリアルでは、ディスクに保存されているプレーンテキストファイルを使用してテキストを分類する方法について説明します。IMDB データセットでセンチメント分析を実行するように、二項分類器をトレーニングします。ノートブックの最後には、Stack Overflow のプログラミングに関する質問のタグを予測するためのマルチクラス分類器をトレーニングする演習があります。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## センチメント分析\n", + "\n", + "このノートブックでは、映画レビューのテキストを使用して、それが*肯定的*であるか*否定的*であるかに分類するようにセンチメント分析モデルをトレーニングします。これは*二項*分類の例で、機械学習問題では重要な分類法として広く適用されます。\n", + "\n", + "ここでは、[Internet Movie Database](https://ai.stanford.edu/~amaas/data/sentiment/) から抽出した 50,000 件の映画レビューを含む、[大規模なレビューデータセット](https://www.imdb.com/)を使います。レビューはトレーニング用とテスト用に 25,000 件ずつに分割されています。トレーニング用とテスト用のデータは均衡しています。言い換えると、それぞれが同数の肯定的及び否定的なレビューを含んでいます。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### IMDB データセットをダウンロードして調べる\n", + "\n", + "データセットをダウンロードして抽出してから、ディレクトリ構造を調べてみましょう。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "`aclImdb/train/pos` および `aclImdb/train/neg` ディレクトリには多くのテキストファイルが含まれており、それぞれが 1 つの映画レビューです。それらの 1 つを見てみましょう。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### データセットを読み込む\n", + "\n", + "次に、データをディスクから読み込み、トレーニングに適した形式に準備します。これを行うには、便利な [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) ユーティリティを使用します。このユーティリティは、次のようなディレクトリ構造を想定しています。\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "二項分類用のデータセットを準備するには、ディスクに `class_a` および `class_b`に対応する 2 
つのフォルダが必要です。これらは、`aclImdb/train/pos` および `aclImdb/train/neg` にある肯定的および否定的な映画レビューになります。IMDB データセットには追加のフォルダーが含まれているため、このユーティリティを使用する前にそれらを削除します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "次に、`text_dataset_from_directory` ユーティリティを使用して、ラベル付きの `tf.data.Dataset` を作成します。[tf.data](https://www.tensorflow.org/guide/data) は、データを操作するための強力なツールのコレクションです。\n", + "\n", + "機械学習実験を実行するときは、データセットを[トレーニング](https://developers.google.com/machine-learning/glossary#training_set)、[検証](https://developers.google.com/machine-learning/glossary#validation_set)、および、[テスト](https://developers.google.com/machine-learning/glossary#test-set)の 3 つに分割することをお勧めします。\n", + "\n", + "IMDB データセットはすでにトレーニング用とテスト用に分割されていますが、検証セットはありません。以下の `validation_split` 引数を使用して、トレーニングデータの 80:20 分割を使用して検証セットを作成しましょう。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "上記のように、トレーニングフォルダには 25,000 の例があり、そのうち 80% (20,000) をトレーニングに使用します。以下に示すとおり、データセットを `model.fit` に直接渡すことで、モデルをトレーニングできます。`tf.data` を初めて使用する場合は、データセットを繰り返し処理して、次のようにいくつかの例を出力することもできます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "レビューには生のテキストが含まれていることに注意してください(句読点や `
` などのような HTML タグが付いていることもあります)。次のセクションでは、これらの処理方法を示します。\n", + "\n", + "ラベルは 0 または 1 です。これらのどれが肯定的および否定的な映画レビューに対応するかを確認するには、データセットの `class_names` プロパティを確認できます。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "次に、検証およびテスト用データセットを作成します。トレーニング用セットの残りの 5,000 件のレビューを検証に使用します。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "注意: `validation_split` および `subset` 引数を使用する場合は、必ずランダムシードを指定するか、`shuffle=False` を渡して、検証とトレーニング分割に重複がないようにします。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### トレーニング用データセットを準備する\n", + "\n", + "次に、便利な `tf.keras.layers.TextVectorization` レイヤーを使用して、データを標準化、トークン化、およびベクトル化します。\n", + "\n", + "標準化とは、テキストを前処理することを指します。通常、句読点や HTML 要素を削除して、データセットを簡素化します。トークン化とは、文字列をトークンに分割することです (たとえば、空白で分割することにより、文を個々の単語に分割します)。ベクトル化とは、トークンを数値に変換して、ニューラルネットワークに入力できるようにすることです。これらのタスクはすべて、このレイヤーで実行できます。\n", + "\n", + "前述のとおり、レビューには `
` のようなさまざまな HTML タグが含まれています。これらのタグは、`TextVectorization` レイヤーのデフォルトの標準化機能によって削除されません (テキストを小文字に変換し、デフォルトで句読点を削除しますが、HTML は削除されません)。HTML を削除するカスタム標準化関数を作成します。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "注意: [トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(トレーニング/サービングスキューとも呼ばれます)を防ぐには、トレーニング時とテスト時にデータを同じように前処理することが重要です。これを容易にするためには、このチュートリアルの後半で示すように、`TextVectorization` レイヤーをモデル内に直接含めます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "次に、`TextVectorization` レイヤーを作成します。このレイヤーを使用して、データを標準化、トークン化、およびベクトル化します。`output_mode` を `int` に設定して、トークンごとに一意の整数インデックスを作成します。\n", + "\n", + "デフォルトの分割関数と、上記で定義したカスタム標準化関数を使用していることに注意してください。また、明示的な最大値 `sequence_length` など、モデルの定数をいくつか定義します。これにより、レイヤーはシーケンスを正確に `sequence_length` 値にパディングまたは切り捨てます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "次に、`adapt` を呼び出して、前処理レイヤーの状態をデータセットに適合させます。これにより、モデルは文字列から整数へのインデックスを作成します。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "注意: Adapt を呼び出すときは、トレーニング用データのみを使用することが重要です(テスト用セットを使用すると情報が漏洩します)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "このレイヤーを使用して一部のデータを前処理した結果を確認する関数を作成します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "上記のように、各トークンは整数に置き換えられています。レイヤーで `.get_vocabulary()` を呼び出すことにより、各整数が対応するトークン(文字列)を検索できます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "モデルをトレーニングする準備がほぼ整いました。最後の前処理ステップとして、トレーニング、検証、およびデータセットのテストのために前に作成した TextVectorization レイヤーを適用します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + 
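A quick sanity check can help confirm what the `vectorize_text` mapping above produced. The following sketch is supplementary (not a cell from the original notebook) and assumes `train_ds`, `batch_size`, and `sequence_length` are defined as in the preceding cells; the shapes in the comments are what one would expect, not captured output.

```python
import tensorflow as tf  # assumed to be imported earlier in the notebook

# Supplementary check (not part of the original notebook): after mapping
# vectorize_text over the splits, each element should be a batch of integer
# token IDs padded or truncated to sequence_length, plus a batch of labels.
for token_batch, label_batch in train_ds.take(1):
    print("tokens:", token_batch.shape, token_batch.dtype)  # expected roughly (32, 250), int64
    print("labels:", label_batch.shape, label_batch.dtype)  # expected roughly (32,), int32
```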
"metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### データセットを構成してパフォーマンスを改善する\n", + "\n", + "以下は、I/O がブロックされないようにするためにデータを読み込むときに使用する必要がある 2 つの重要な方法です。\n", + "\n", + "`.cache()` はデータをディスクから読み込んだ後、データをメモリに保持します。これにより、モデルのトレーニング中にデータセットがボトルネックになることを回避できます。データセットが大きすぎてメモリに収まらない場合は、この方法を使用して、パフォーマンスの高いオンディスクキャッシュを作成することもできます。これは、多くの小さなファイルを読み込むより効率的です。\n", + "\n", + "`.prefetch()` はトレーニング中にデータの前処理とモデルの実行をオーバーラップさせます。\n", + "\n", + "以上の 2 つの方法とデータをディスクにキャッシュする方法についての詳細は、[データパフォーマンスガイド](https://www.tensorflow.org/guide/data_performance)を参照してください。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### モデルを作成する\n", + "\n", + "ニューラルネットワークを作成します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "これらのレイヤーは、分類器を構成するため一列に積み重ねられます。\n", + "\n", + "1. 最初のレイヤーは `Embedding` (埋め込み)レイヤーです。このレイヤーは、整数にエンコードされた語彙を受け取り、それぞれの単語インデックスに対応する埋め込みベクトルを検索します。埋め込みベクトルは、モデルのトレーニングの中で学習されます。ベクトル化のために、出力行列には次元が1つ追加されます。その結果、次元は、`(batch, sequence, embedding)` となります。埋め込みの詳細については、[単語埋め込みチュートリアル](https://www.tensorflow.org/text/guide/word_embeddings)を参照してください。\n", + "2. 次は、`GlobalAveragePooling1D`(1次元のグローバル平均プーリング)レイヤーです。このレイヤーは、それぞれのサンプルについて、シーケンスの次元方向に平均値をもとめ、固定長のベクトルを返します。この結果、モデルは最も単純な形で、可変長の入力を扱うことができるようになります。\n", + "3. 
最後のレイヤーは、単一の出力ノードと密に接続されています。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### 損失関数とオプティマイザ\n", + "\n", + "モデルをトレーニングするには、損失関数とオプティマイザが必要です。これは二項分類問題であり、モデルは確率(シグモイドアクティベーションを持つ単一ユニットレイヤー)を出力するため、`losses.BinaryCrossentropy` 損失関数を使用します。\n", + "\n", + "損失関数の候補はこれだけではありません。例えば、`mean_squared_error`(平均二乗誤差)を使うこともできます。しかし、一般的には、確率を扱うには`binary_crossentropy`の方が適しています。`binary_crossentropy`は、確率分布の間の「距離」を測定する尺度です。今回の場合には、真の分布と予測値の分布の間の距離ということになります。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### モデルをトレーニングする\n", + "\n", + "`dataset` オブジェクトを fit メソッドに渡すことにより、モデルをトレーニングします。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### モデルを評価する\n", + "\n", + "モデルがどのように実行するか見てみましょう。2 つの値が返されます。損失(誤差、値が低いほど良)と正確度です。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "この、かなり素朴なアプローチでも 86% 前後の正解度を達成しました。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### 経時的な正解度と損失のグラフを作成する\n", + "\n", + "`model.fit()` は、トレーニング中に発生したすべての情報を詰まったディクショナリを含む `History` オブジェクトを返します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "トレーニングと検証中に監視されている各メトリックに対して 1 つずつ、計 4 つのエントリがあります。このエントリを使用して、トレーニングと検証の損失とトレーニングと検証の正解度を比較したグラフを作成することができます。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower 
right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "このグラフでは、点はトレーニングの損失と正解度を表し、実線は検証の損失と正解度を表します。\n", + "\n", + "トレーニングの損失がエポックごとに*下降*し、トレーニングの正解度がエポックごとに*上昇*していることに注目してください。これは、勾配下降最適化を使用しているときに見られる現象で、イテレーションごとに希望する量を最小化します。\n", + "\n", + "これは検証の損失と精度には当てはまりません。これらはトレーニング精度の前にピークに達しているようです。これが過適合の例で、モデルが、遭遇したことのないデータよりもトレーニングデータで優れたパフォーマンスを発揮する現象です。この後、モデルは過度に最適化し、テストデータに*一般化*しないトレーニングデータ*特有*の表現を学習します。\n", + "\n", + "この特定のケースでは、検証の正解度が向上しなくなったときにトレーニングを停止することにより、過適合を防ぐことができます。これを行うには、`tf.keras.callbacks.EarlyStopping` コールバックを使用することができます。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## モデルをエクスポートする\n", + "\n", + "上記のコードでは、モデルにテキストをフィードする前に、`TextVectorization` レイヤーをデータセットに適用しました。モデルで生の文字列を処理できるようにする場合 (たとえば、展開を簡素化するため)、モデル内に `TextVectorization` レイヤーを含めることができます。これを行うには、トレーニングしたばかりの重みを使用して新しいモデルを作成します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### 新しいデータの推論\n", + "\n", + "新しい例の予測を取得するには、`model.predict()`を呼び出します。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "モデル内にテキスト前処理ロジックを含めると、モデルを本番環境にエクスポートして展開を簡素化し、[トレーニング/テストスキュー](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)の可能性を減らすことができます。\n", + "\n", + "TextVectorization レイヤーを適用する場所を選択する際に性能の違いに留意する必要があります。モデルの外部で使用すると、GPU でトレーニングするときに非同期 CPU 処理とデータのバッファリングを行うことができます。したがって、GPU でモデルをトレーニングしている場合は、モデルの開発中に最高のパフォーマンスを得るためにこのオプションを使用し、デプロイの準備ができたらモデル内に TextVectorization レイヤーを含めるように切り替えることをお勧めします。\n", + "\n", + "モデルの保存の詳細については、この[チュートリアル](https://www.tensorflow.org/tutorials/keras/save_and_load)にアクセスしてください。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## 演習:StackOverflow の質問に対するマルチクラス分類\n", + "\n", + "このチュートリアルでは、IMDB データセットで二項分類器を最初からトレーニングする方法を示しました。演習として、このノートブックを変更して、[Stack Overflow](http://stackoverflow.com/) のプログラミング質問のタグを予測するマルチクラス分類器をトレーニングできます。\n", + "\n", + "Stack Overflow に投稿された数千のプログラミングに関する質問(たとえば、「Python でディクショナリを値で並べ替える方法」)の本文を含む[データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)が用意されています。それぞれ、1 つのタグ(Python、CSharp、JavaScript、または Java のいずれか)でラベル付けされています。この演習では、質問を入力として受け取り、適切なタグ(この場合は Python)を予測します。\n", + "\n", + "使用するデータセットには、1,700 万件以上の投稿を含む [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) の大規模な StackOverflow パブリックデータセットから抽出された数千の質問が含まれています。\n", + "\n", + "データセットをダウンロードすると、以前に使用した IMDB データセットと同様のディレクトリ構造になっていることがわかります。\n", + "\n", + 
"```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "注意: 分類問題の難易度を上げるために、プログラミングの質問での Python、CSharp、JavaScript、または Java という単語は、*blank* という単語に置き換えられました(多くの質問には、対象の言語が含まれているため)。\n", + "\n", + "この演習を完了するには、、このノートブックを変更してStackOverflow データセットを操作する必要があります。次の変更を行います。\n", + "\n", + "1. ノートブックの上部で、IMDB データセットをダウンロードするコードを、事前に準備されている [Stack Overflow データセット](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)をダウンロードするコードで更新します。Stack Overflow データセットは同様のディレクトリ構造を持っているため、多くの変更を加える必要はありません。\n", + "\n", + "2. 4 つの出力クラスがあるため、モデルの最後のレイヤーを `Dense(4)` に変更します。\n", + "\n", + "3. モデルをコンパイルするときは、損失を `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)` に変更します。これは、各クラスのラベルが整数である場合に、マルチクラス分類問題に使用する正しい損失関数です。(この場合、 0、*1*、*2*、または 3 のいずれかになります)。さらに、これはマルチクラス分類の問題であるため、メトリックを `metrics=['accuracy']` に変更します (tf.metrics.BinaryAccuracy はバイナリ分類器にのみ使用されます)。\n", + "\n", + "4. 経時的な精度をプロットする場合は、`binary_accuracy` および `val_binary_accuracy`をそれぞれ `accuracy` および `val_accuracy` に変更します。\n", + "\n", + "5. これらの変更が完了すると、マルチクラス分類器をトレーニングできるようになります。 " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## 詳細\n", + "\n", + "このチュートリアルでは、最初からテキスト分類を実行する方法を紹介しました。一般的なテキスト分類ワークフローの詳細については、Google Developers の[テキスト分類ガイド](https://developers.google.com/machine-learning/guides/text-classification/)をご覧ください。\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/ko/tutorials/keras/text_classification.ipynb b/site/ko/tutorials/keras/text_classification.ipynb index 33bbca4842..74b14fda01 100644 --- a/site/ko/tutorials/keras/text_classification.ipynb +++ b/site/ko/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# 영화 리뷰를 사용한 텍스트 분류" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
TensorFlow.org에서 보기 | Google Colab에서 실행 | GitHub에서 소스 보기 | 노트북 다운로드
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "이 튜토리얼은 디스크에 저장된 일반 텍스트 파일에서 시작하는 텍스트 분류를 보여줍니다. IMDB 데이터세트에 대한 감정 분석을 수행하도록 이진 분류기를 훈련합니다. 노트북의 마지막에는 스택 오버플로에서 프로그래밍 질문에 대한 태그를 예측하도록 다중 클래스 분류기를 훈련하는 연습을 시도해볼 수 있습니다.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## 감정 분석\n", - "\n", - "이 노트북은 리뷰 텍스트를 사용하여 영화 리뷰를 *긍정적* 또는 *부정적*으로 분류합니다. 중요하고 널리 적용 가능한 머신러닝 문제인 *이진* 분류의 예입니다.\n", - "\n", - "[IMDB 데이터세트](https://ai.stanford.edu/~amaas/data/sentiment/)에는 [인터넷 영화 데이터베이스](https://www.imdb.com/)에서 가져온 50,000개의 영화 리뷰 텍스트가 포함되어 있습니다. 훈련용 리뷰 25,000개와 테스트용 리뷰 25,000개로 나뉩니다. 훈련 및 테스트 세트는 *균형을 이룹니다*. 즉, 동일한 수의 긍정적인 리뷰와 부정적인 리뷰가 포함되어 있습니다.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### 데이터세트 다운로드 및 탐색하기\n", - "\n", - "데이터 세트를 다운로드하여 추출한 다음 디렉터리 구조를 살펴보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "`aclImdb/train/pos` 및 `aclImdb/train/neg` 디렉토리에는 각각 단일 영화를 리뷰한 많은 텍스트 파일이 포함되어 있습니다. 그 중 하나를 살펴보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### 데이터세트 로드하기\n", - "\n", - "다음으로, 디스크에서 데이터를 로드하고 훈련에 적합한 형식으로 준비합니다. 이를 위해 다음과 같은 디렉토리 구조를 예상하는 유용한 [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 유틸리티를 사용합니다.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "이진 분류를 위한 데이터세트를 준비하려면 디스크에 `class_a` 및 `class_b`에 해당하는 두 개의 폴더가 필요합니다. 이것들은 `aclImdb/train/pos` 및 `aclImdb/train/neg`에서 찾을 수 있는 긍정적 영화 리뷰와 부정적 영화 리뷰입니다. 
IMDB 데이터세트에는 추가 폴더가 포함되어 있으므로 이 유틸리티를 사용하기 전에 제거합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "다음으로 `text_dataset_from_directory` 유틸리티를 사용하여 레이블이 지정된 `tf.data.Dataset`를 만듭니다. [tf.data](https://www.tensorflow.org/guide/data)는 데이터 작업을 위한 강력한 도구 모음입니다.\n", - "\n", - "머신러닝 실험을 실행할 때 데이터세트를 [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set) 및 [test](https://developers.google.com/machine-learning/glossary#test-set)의 세 부분으로 나누는 것이 가장 좋습니다.\n", - "\n", - "IMDB 데이터세트는 이미 훈련과 테스트로 나누어져 있지만 검증 세트가 부족합니다. 아래 `validation_split` 인수를 사용하여 훈련 데이터를 80:20으로 분할하여 검증 세트를 생성해 보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "위에서 볼 수 있듯이 training 폴더에는 25,000개의 예제가 있으며 그 중 80%(또는 20,000개)를 훈련에 사용할 것입니다. 잠시 후에 알 수 있겠지만 데이터세트를 `model.fit`에 직접 전달하여 모델을 훈련할 수 있습니다. `tf.data`를 처음 사용하는 경우 데이터세트를 반복하고 다음과 같이 몇 가지 예를 출력할 수도 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "리뷰에는 `
`와 같은 간헐적 HTML 태그와 구두점을 포함한 원시 텍스트가 포함되어 있다는 점에 주목하세요. 다음 섹션에서 이를 처리하는 방법을 보여줍니다.\n", - "\n", - "레이블은 0 또는 1입니다. 이들 중 어느 것이 긍정적이고 부정적인 영화 리뷰에 해당하는지 확인하려면 데이터세트에서 `class_names` 속성을 확인할 수 있습니다.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "다음으로, 검증 및 테스트 데이터세트를 만듭니다. 검증을 위해 훈련 세트의 나머지 5,000개 리뷰를 사용합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "참고: `validation_split` 및 `subset` 인수를 사용할 때 검증 및 훈련 분할이 겹치지 않도록 임의 시드를 지정하거나 `shuffle=False`를 전달하는 것을 잊지 마세요." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### 훈련을 위한 데이터세트 준비하기\n", - "\n", - "다음으로, 유용한 `tf.keras.layers.TextVectorization` 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다.\n", - "\n", - "표준화는 일반적으로 구두점이나 HTML 요소를 제거하여 데이터세트를 단순화하기 위해 텍스트를 전처리하는 것을 말합니다. 토큰화는 문자열을 여러 토큰으로 분할하는 것을 말합니다(예: 화이트스페이스에서 분할하여 문장을 개별 단어로 분할). 벡터화는 토큰을 숫자로 변환하여 신경망에 공급될 수 있도록 하는 것을 말합니다. 이러한 모든 작업을 이 레이어에서 수행할 수 있습니다.\n", - "\n", - "위에서 볼 수 있듯이 리뷰에는 `
`와 같은 다양한 HTML 태그가 포함되어 있습니다. 이러한 태그는 `TextVectorization` 레이어의 기본 표준화 도구로 제거되지 않습니다(텍스트를 소문자로 변환하고 기본적으로 구두점을 제거하지만 HTML은 제거하지 않음). HTML을 제거하기 위해 사용자 정의 표준화 함수를 작성합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "참고: [훈련-테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(훈련-제공 왜곡이라고도 함)를 방지하려면 훈련 및 테스트 시간에 데이터를 동일하게 전처리하는 것이 중요합니다. 이를 용이하게 하기 위해 `TextVectorization` 레이어를 모델 내에 직접 포함할 수 있습니다. 본 튜토리얼에서 나중에 이 내용을 알아봅니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "다음으로 `TextVectorization` 레이어를 만듭니다. 이 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다. 각 토큰에 대해 고유한 정수 인덱스를 생성하도록 `output_mode`를 `int`로 설정합니다.\n", - "\n", - "기본 분할 함수와 위에서 정의한 사용자 지정 표준화 함수를 사용하고 있습니다. 명시적 최대값인 `sequence_length`와 같이 모델에 대한 몇 가지 상수를 정의하여 레이어가 시퀀스를 정확히 `sequence_length` 값으로 채우거나 자르도록 합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "다음으로, 전처리 레이어의 상태를 데이터세트에 맞추기 위해 `adapt`를 호출합니다. 그러면 모델이 문자열 인덱스를 정수로 빌드합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "참고: adapt를 호출할 때 훈련 데이터만 사용하는 것이 중요합니다(테스트세트를 사용하면 정보가 누출됨)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "이 레이어를 사용하여 일부 데이터를 전처리한 결과를 확인하는 함수를 만들어 보겠습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "위에서 볼 수 있듯이 각 토큰은 정수로 대체되었습니다. 레이어에서 `.get_vocabulary()`를 호출하여 각 정수에 해당하는 토큰(문자열)을 조회할 수 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "모델을 훈련할 준비가 거의 되었습니다. 최종 전처리 단계로 이전에 생성한 TextVectorization 레이어를 훈련, 검증 및 테스트 데이터세트에 적용합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### 성능을 높이도록 데이터세트 구성하기\n", - "\n", - "다음은 I/O가 차단되지 않도록 데이터를 로드할 때 사용해야 하는 두 가지 중요한 메서드입니다.\n", - "\n", - "`.cache()`는 데이터가 디스크에서 로드된 후 메모리에 데이터를 보관합니다. 이렇게 하면 모델을 훈련하는 동안 데이터세트로 인해 병목 현상이 발생하지 않습니다. 데이터세트가 너무 커서 메모리에 맞지 않는 경우, 이 메서드를 사용하여 성능이 뛰어난 온 디스크 캐시를 생성할 수도 있습니다. 많은 작은 파일보다 읽기가 더 효율적입니다.\n", - "\n", - "`.prefetch()`는 훈련 중에 데이터 전처리 및 모델 실행과 겹칩니다.\n", - "\n", - "[데이터 성능 가이드](https://www.tensorflow.org/guide/data_performance)에서 두 가지 메서드와 데이터를 디스크에 캐싱하는 방법에 관해 자세히 알아볼 수 있습니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### 모델 생성\n", - "\n", - "이제 신경망을 만들 차례입니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "층을 순서대로 쌓아 분류기(classifier)를 만듭니다:\n", - "\n", - "1. 첫 번째 레이어는 `Embedding` 레이어입니다. 이 레이어는 정수로 인코딩된 리뷰를 입력 받고 각 단어 인덱스에 해당하는 임베딩 벡터를 찾습니다. 이러한 벡터는 모델이 훈련되면서 학습됩니다. 이들 벡터는 출력 배열에 차원을 추가합니다. 최종 차원은 `(batch, sequence, embedding)`이 됩니다. 임베딩에 대해 보다 자세히 알아보려면 [단어 임베딩](https://www.tensorflow.org/text/guide/word_embeddings) 튜토리얼을 확인하세요.\n", - "2. 그다음 `GlobalAveragePooling1D` 층은 `sequence` 차원에 대해 평균을 계산하여 각 샘플에 대해 고정된 길이의 출력 벡터를 반환합니다. 이는 길이가 다른 입력을 다루는 가장 간단한 방법입니다.\n", - "3. 마지막 층은 하나의 출력 노드(node)를 가진 완전 연결 층입니다. `sigmoid` 활성화 함수를 사용하여 0과 1 사이의 실수를 출력합니다. 이 값은 확률 또는 신뢰도를 나타냅니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### 손실 함수와 옵티마이저\n", - "\n", - "모델이 훈련하려면 손실 함수(loss function)과 옵티마이저(optimizer)가 필요합니다. 이 예제는 이진 분류 문제이고 모델이 확률을 출력하므로(출력층의 유닛이 하나이고 `sigmoid` 활성화 함수를 사용합니다), `binary_crossentropy` 손실 함수를 사용하겠습니다.\n", - "\n", - "이제 최적화 기와 손실 함수를 사용하도록 모델을 구성합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### 모델 훈련하기\n", - "\n", - "`dataset` 개체를 fit 메서드에 전달하여 모델을 훈련합니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### 모델 평가하기\n", - "\n", - "모델의 성능을 확인해 보죠. 두 개의 값이 반환됩니다. 손실(오차를 나타내는 숫자이므로 낮을수록 좋습니다)과 정확도입니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "이 상당히 단순한 접근 방식은 약 86%의 정확도를 달성합니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### 정확도와 손실 그래프 그리기\n", - "\n", - "`model.fit()`은 훈련 중에 발생한 모든 것을 가진 사전을 포함하는 `History` 객체를 반환합니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "네 개의 항목이 있습니다. 훈련과 검증 단계에서 모니터링하는 지표들입니다. 훈련 손실과 검증 손실을 그래프로 그려 보고, 훈련 정확도와 검증 정확도도 그래프로 그려서 비교해 보겠습니다:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "이 그래프에서 점선은 훈련 손실과 훈련 정확도를 나타냅니다. 실선은 검증 손실과 검증 정확도입니다.\n", - "\n", - "훈련 손실은 각 epoch마다 *감소*하고 훈련 정확성은 각 epoch마다 *증가*합니다. 경사 하강 최적화를 사용할 때 이와 같이 예상됩니다. 모든 반복에서 원하는 수량을 최소화해야 합니다.\n", - "\n", - "하지만 검증 손실과 검증 정확도에서는 그렇지 못합니다. 훈련 정확도 이전이 피크인 것 같습니다. 이는 과대적합 때문입니다. 이전에 본 적 없는 데이터보다 훈련 데이터에서 모델이 더 잘 동작합니다. 이 지점부터는 모델이 과도하게 최적화되어 테스트 데이터에서 *일반화*되지 않는 훈련 데이터의 *특정* 표현을 학습합니다.\n", - "\n", - "여기에서는 과대적합을 막기 위해 단순히 검증 정확도가 더 이상 증가하지 않는 경우에 훈련을 중단할 수 있습니다. 이를 수행하는 한 가지 방법은 `tf.keras.callbacks.EarlyStopping` 콜백을 사용하는 것입니다." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## 모델 내보내기\n", - "\n", - "위의 코드에서는 모델에 텍스트를 제공하기 전에 `TextVectorization` 레이어를 데이터세트에 적용했습니다. 모델이 원시 문자열을 처리할 수 있도록 하려면(예: 배포를 단순화하기 위해) 모델 내부에 `TextVectorization` 레이어를 포함할 수 있습니다. 이를 위해 방금 훈련한 가중치를 사용하여 새 모델을 만들 수 있습니다." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### 새로운 데이터로 추론하기\n", - "\n", - "새로운 예에 대한 예측을 얻으려면 간단히 `model.predict()`를 호출하면 됩니다." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "모델 내부에 텍스트 전처리 논리를 포함하면 배포를 단순화하고 [훈련/테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) 가능성을 줄이는 프로덕션용 모델을 내보낼 수 있습니다.\n", - "\n", - "TextVectorization 레이어를 적용할 위치를 선택할 때 염두에 두어야 할 성능 차이가 있습니다. 레이어를 모델 외부에서 사용하면 GPU에서 훈련할 때 비동기 CPU 처리 및 데이터 버퍼링을 수행할 수 있습니다. 따라서 GPU에서 모델을 훈련하는 경우 모델을 개발하는 동안 최상의 성능을 얻기 위해 이 옵션을 사용하고 배포 준비가 완료되면 모델 내부에 TextVectorization 레이어를 포함하도록 전환할 수 있습니다.\n", - "\n", - "모델 저장에 대해 자세히 알아보려면 이 [튜토리얼](https://www.tensorflow.org/tutorials/keras/save_and_load)을 방문하세요." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## 연습: 스택 오버플로 질문에 대한 다중 클래스 분류\n", - "\n", - "이 튜토리얼은 IMDB 데이터세트에서 이진 분류자를 처음부터 훈련하는 방법을 보여주었습니다. 연습으로, 이 노트북을 수정하여 [스택 오버플로](http://stackoverflow.com/)에서 프로그래밍 질문의 태그를 예측하도록 다중 클래스 분류자를 훈련할 수 있습니다.\n", - "\n", - "스택 오버플로에 게시된 수천 개의 프로그래밍 질문(예: \"Python에서 값을 기준으로 사전을 정렬할 수 있는 방법은?\")의 본문이 포함된 [데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)가 준비되어 있습니다. 이들 각각은 정확히 하나의 태그(Python, CSharp, JavaScript 또는 Java)로 레이블이 지정됩니다. 여러분이 할 작업은 질문을 입력으로 받아 적절한 태그(이 경우 Python)를 예측하는 것입니다.\n", - "\n", - "작업할 데이터세트에는 1,700만 개 이상의 게시물이 포함된 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow)의 훨씬 더 큰 공개 스택 오버플로 데이터세트에서 추출한 수천 개의 질문이 포함되어 있습니다.\n", - "\n", - "데이터세트를 다운로드해 보면 이전에 작업한 IMDB 데이터세트와 유사한 디렉터리 구조를 가지고 있음을 알 수 있습니다.\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "참고: 분류 문제의 난이도를 높이기 위해 프로그래밍 질문에서 Python, CSharp, JavaScript 또는 Java라는 단어의 출현은 *blank*라는 단어로 대체되었습니다(많은 질문에 해당 언어가 포함됨).\n", - "\n", - "이 연습을 완료하려면 다음과 같이 수정하여 스택 오버플로 데이터세트와 함께 작동하도록 이 노트북을 수정해야 합니다.\n", - "\n", - "1. 노트북 상단에서, 미리 준비된 [스택 오버플로 데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)를 다운로드하는 코드로 IMDB 데이터세트를 다운로드하는 코드를 업데이트합니다. 스택 오버플로 데이터세트는 유사한 디렉터리 구조를 가지므로 많이 수정할 필요가 없습니다.\n", - "\n", - "2. 이제 4개의 출력 클래스가 있으므로 `Dense(4)`를 읽도록 모델의 마지막 레이어를 수정합니다.\n", - "\n", - "3. 모델을 컴파일할 때 손실을 `tf.keras.losses.SparseCategoricalCrossentropy`로 변경합니다. 
이것은 각 클래스의 레이블이 정수일 때(이 경우 0, *1*, *2* 또는 *3*일 수 있음) 다중 클래스 분류 문제에 사용할 올바른 손실 함수입니다. 또한 이것은 다중 클래스 분류 문제이기 때문에 메트릭을 `metrics=['accuracy']`로 변경합니다(`tf.metrics.BinaryAccuracy`는 이진 분류자에만 사용됨).\n", - "\n", - "4. 시간 경과에 따른 정확도를 표시할 때 `binary_accuracy` 및 `val_binary_accuracy`를 각각 `accuracy` 및 `val_accuracy`로 변경합니다.\n", - "\n", - "5. 이러한 변경이 완료되면 다중 클래스 분류자를 훈련할 수 있습니다. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## 더 알아보기\n", - "\n", - "이 튜토리얼은 텍스트 분류를 처음부터 알아보았습니다. 일반적인 텍스트 분류 워크플로에 대해 자세히 알아보려면 Google Developers의 [텍스트 분류 가이드](https://developers.google.com/machine-learning/guides/text-classification/)를 확인하세요.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# 영화 리뷰를 사용한 텍스트 분류" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
TensorFlow.org에서 보기 | Google Colab에서 실행 | GitHub에서 소스 보기 | 노트북 다운로드
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "이 튜토리얼은 디스크에 저장된 일반 텍스트 파일에서 시작하는 텍스트 분류를 보여줍니다. IMDB 데이터세트에 대한 감정 분석을 수행하도록 이진 분류기를 훈련합니다. 노트북의 마지막에는 스택 오버플로에서 프로그래밍 질문에 대한 태그를 예측하도록 다중 클래스 분류기를 훈련하는 연습을 시도해볼 수 있습니다.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## 감정 분석\n", + "\n", + "이 노트북은 리뷰 텍스트를 사용하여 영화 리뷰를 *긍정적* 또는 *부정적*으로 분류합니다. 중요하고 널리 적용 가능한 머신러닝 문제인 *이진* 분류의 예입니다.\n", + "\n", + "[IMDB 데이터세트](https://ai.stanford.edu/~amaas/data/sentiment/)에는 [인터넷 영화 데이터베이스](https://www.imdb.com/)에서 가져온 50,000개의 영화 리뷰 텍스트가 포함되어 있습니다. 훈련용 리뷰 25,000개와 테스트용 리뷰 25,000개로 나뉩니다. 훈련 및 테스트 세트는 *균형을 이룹니다*. 즉, 동일한 수의 긍정적인 리뷰와 부정적인 리뷰가 포함되어 있습니다.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### 데이터세트 다운로드 및 탐색하기\n", + "\n", + "데이터 세트를 다운로드하여 추출한 다음 디렉터리 구조를 살펴보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "`aclImdb/train/pos` 및 `aclImdb/train/neg` 디렉토리에는 각각 단일 영화를 리뷰한 많은 텍스트 파일이 포함되어 있습니다. 그 중 하나를 살펴보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### 데이터세트 로드하기\n", + "\n", + "다음으로, 디스크에서 데이터를 로드하고 훈련에 적합한 형식으로 준비합니다. 이를 위해 다음과 같은 디렉토리 구조를 예상하는 유용한 [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 유틸리티를 사용합니다.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "이진 분류를 위한 데이터세트를 준비하려면 디스크에 `class_a` 및 `class_b`에 해당하는 두 개의 폴더가 필요합니다. 이것들은 `aclImdb/train/pos` 및 `aclImdb/train/neg`에서 찾을 수 있는 긍정적 영화 리뷰와 부정적 영화 리뷰입니다. 
IMDB 데이터세트에는 추가 폴더가 포함되어 있으므로 이 유틸리티를 사용하기 전에 제거합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "다음으로 `text_dataset_from_directory` 유틸리티를 사용하여 레이블이 지정된 `tf.data.Dataset`를 만듭니다. [tf.data](https://www.tensorflow.org/guide/data)는 데이터 작업을 위한 강력한 도구 모음입니다.\n", + "\n", + "머신러닝 실험을 실행할 때 데이터세트를 [train](https://developers.google.com/machine-learning/glossary#training_set), [validation](https://developers.google.com/machine-learning/glossary#validation_set) 및 [test](https://developers.google.com/machine-learning/glossary#test-set)의 세 부분으로 나누는 것이 가장 좋습니다.\n", + "\n", + "IMDB 데이터세트는 이미 훈련과 테스트로 나누어져 있지만 검증 세트가 부족합니다. 아래 `validation_split` 인수를 사용하여 훈련 데이터를 80:20으로 분할하여 검증 세트를 생성해 보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "위에서 볼 수 있듯이 training 폴더에는 25,000개의 예제가 있으며 그 중 80%(또는 20,000개)를 훈련에 사용할 것입니다. 잠시 후에 알 수 있겠지만 데이터세트를 `model.fit`에 직접 전달하여 모델을 훈련할 수 있습니다. `tf.data`를 처음 사용하는 경우 데이터세트를 반복하고 다음과 같이 몇 가지 예를 출력할 수도 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "리뷰에는 `
`와 같은 간헐적 HTML 태그와 구두점을 포함한 원시 텍스트가 포함되어 있다는 점에 주목하세요. 다음 섹션에서 이를 처리하는 방법을 보여줍니다.\n", + "\n", + "레이블은 0 또는 1입니다. 이들 중 어느 것이 긍정적이고 부정적인 영화 리뷰에 해당하는지 확인하려면 데이터세트에서 `class_names` 속성을 확인할 수 있습니다.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "다음으로, 검증 및 테스트 데이터세트를 만듭니다. 검증을 위해 훈련 세트의 나머지 5,000개 리뷰를 사용합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "참고: `validation_split` 및 `subset` 인수를 사용할 때 검증 및 훈련 분할이 겹치지 않도록 임의 시드를 지정하거나 `shuffle=False`를 전달하는 것을 잊지 마세요." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### 훈련을 위한 데이터세트 준비하기\n", + "\n", + "다음으로, 유용한 `tf.keras.layers.TextVectorization` 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다.\n", + "\n", + "표준화는 일반적으로 구두점이나 HTML 요소를 제거하여 데이터세트를 단순화하기 위해 텍스트를 전처리하는 것을 말합니다. 토큰화는 문자열을 여러 토큰으로 분할하는 것을 말합니다(예: 화이트스페이스에서 분할하여 문장을 개별 단어로 분할). 벡터화는 토큰을 숫자로 변환하여 신경망에 공급될 수 있도록 하는 것을 말합니다. 이러한 모든 작업을 이 레이어에서 수행할 수 있습니다.\n", + "\n", + "위에서 볼 수 있듯이 리뷰에는 `
`와 같은 다양한 HTML 태그가 포함되어 있습니다. 이러한 태그는 `TextVectorization` 레이어의 기본 표준화 도구로 제거되지 않습니다(텍스트를 소문자로 변환하고 기본적으로 구두점을 제거하지만 HTML은 제거하지 않음). HTML을 제거하기 위해 사용자 정의 표준화 함수를 작성합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "참고: [훈련-테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(훈련-제공 왜곡이라고도 함)를 방지하려면 훈련 및 테스트 시간에 데이터를 동일하게 전처리하는 것이 중요합니다. 이를 용이하게 하기 위해 `TextVectorization` 레이어를 모델 내에 직접 포함할 수 있습니다. 본 튜토리얼에서 나중에 이 내용을 알아봅니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "다음으로 `TextVectorization` 레이어를 만듭니다. 이 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다. 각 토큰에 대해 고유한 정수 인덱스를 생성하도록 `output_mode`를 `int`로 설정합니다.\n", + "\n", + "기본 분할 함수와 위에서 정의한 사용자 지정 표준화 함수를 사용하고 있습니다. 명시적 최대값인 `sequence_length`와 같이 모델에 대한 몇 가지 상수를 정의하여 레이어가 시퀀스를 정확히 `sequence_length` 값으로 채우거나 자르도록 합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "다음으로, 전처리 레이어의 상태를 데이터세트에 맞추기 위해 `adapt`를 호출합니다. 그러면 모델이 문자열 인덱스를 정수로 빌드합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "참고: adapt를 호출할 때 훈련 데이터만 사용하는 것이 중요합니다(테스트세트를 사용하면 정보가 누출됨)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "이 레이어를 사용하여 일부 데이터를 전처리한 결과를 확인하는 함수를 만들어 보겠습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "위에서 볼 수 있듯이 각 토큰은 정수로 대체되었습니다. 레이어에서 `.get_vocabulary()`를 호출하여 각 정수에 해당하는 토큰(문자열)을 조회할 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "모델을 훈련할 준비가 거의 되었습니다. 최종 전처리 단계로 이전에 생성한 TextVectorization 레이어를 훈련, 검증 및 테스트 데이터세트에 적용합니다." 
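The next cell maps `vectorize_text` over each split exactly as described above. As an optional variant, and purely as an assumption on my part rather than something this notebook does, the same mapping could be parallelized with `tf.data`'s `num_parallel_calls`:

```python
import tensorflow as tf  # assumed to be imported earlier in the notebook

# Optional variant (not what the notebook's next cell does): parallelize the
# vectorization map across CPU cores using tf.data.AUTOTUNE.
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.map(vectorize_text, num_parallel_calls=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text, num_parallel_calls=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text, num_parallel_calls=AUTOTUNE)
```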
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### 성능을 높이도록 데이터세트 구성하기\n", + "\n", + "다음은 I/O가 차단되지 않도록 데이터를 로드할 때 사용해야 하는 두 가지 중요한 메서드입니다.\n", + "\n", + "`.cache()`는 데이터가 디스크에서 로드된 후 메모리에 데이터를 보관합니다. 이렇게 하면 모델을 훈련하는 동안 데이터세트로 인해 병목 현상이 발생하지 않습니다. 데이터세트가 너무 커서 메모리에 맞지 않는 경우, 이 메서드를 사용하여 성능이 뛰어난 온 디스크 캐시를 생성할 수도 있습니다. 많은 작은 파일보다 읽기가 더 효율적입니다.\n", + "\n", + "`.prefetch()`는 훈련 중에 데이터 전처리 및 모델 실행과 겹칩니다.\n", + "\n", + "[데이터 성능 가이드](https://www.tensorflow.org/guide/data_performance)에서 두 가지 메서드와 데이터를 디스크에 캐싱하는 방법에 관해 자세히 알아볼 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### 모델 생성\n", + "\n", + "이제 신경망을 만들 차례입니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "층을 순서대로 쌓아 분류기(classifier)를 만듭니다:\n", + "\n", + "1. 첫 번째 레이어는 `Embedding` 레이어입니다. 이 레이어는 정수로 인코딩된 리뷰를 입력 받고 각 단어 인덱스에 해당하는 임베딩 벡터를 찾습니다. 이러한 벡터는 모델이 훈련되면서 학습됩니다. 이들 벡터는 출력 배열에 차원을 추가합니다. 최종 차원은 `(batch, sequence, embedding)`이 됩니다. 임베딩에 대해 보다 자세히 알아보려면 [단어 임베딩](https://www.tensorflow.org/text/guide/word_embeddings) 튜토리얼을 확인하세요.\n", + "2. 그다음 `GlobalAveragePooling1D` 층은 `sequence` 차원에 대해 평균을 계산하여 각 샘플에 대해 고정된 길이의 출력 벡터를 반환합니다. 이는 길이가 다른 입력을 다루는 가장 간단한 방법입니다.\n", + "3. 마지막 층은 하나의 출력 노드(node)를 가진 완전 연결 층입니다. `sigmoid` 활성화 함수를 사용하여 0과 1 사이의 실수를 출력합니다. 이 값은 확률 또는 신뢰도를 나타냅니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### 손실 함수와 옵티마이저\n", + "\n", + "모델이 훈련하려면 손실 함수(loss function)과 옵티마이저(optimizer)가 필요합니다. 이 예제는 이진 분류 문제이고 모델이 확률을 출력하므로(출력층의 유닛이 하나이고 `sigmoid` 활성화 함수를 사용합니다), `binary_crossentropy` 손실 함수를 사용하겠습니다.\n", + "\n", + "이제 최적화 기와 손실 함수를 사용하도록 모델을 구성합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### 모델 훈련하기\n", + "\n", + "`dataset` 개체를 fit 메서드에 전달하여 모델을 훈련합니다." 
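The cell below trains for a fixed 10 epochs. Because the overfitting discussion later in this notebook points to `tf.keras.callbacks.EarlyStopping` without showing code, here is a hedged sketch of how that callback could be attached to the same `fit` call; the monitored metric and `patience` value are illustrative choices, not settings from the notebook.

```python
import tensorflow as tf  # assumed to be imported earlier in the notebook

# Hedged sketch: instead of always running the full 10 epochs, stop when the
# validation loss stops improving. monitor/patience are illustrative choices;
# the original notebook does not configure this callback.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',         # metric reported by model.fit via validation_data
    patience=2,                 # tolerate 2 epochs without improvement
    restore_best_weights=True)  # keep the weights from the best epoch

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[early_stopping])
```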
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### 모델 평가하기\n", + "\n", + "모델의 성능을 확인해 보죠. 두 개의 값이 반환됩니다. 손실(오차를 나타내는 숫자이므로 낮을수록 좋습니다)과 정확도입니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "이 상당히 단순한 접근 방식은 약 86%의 정확도를 달성합니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### 정확도와 손실 그래프 그리기\n", + "\n", + "`model.fit()`은 훈련 중에 발생한 모든 것을 가진 사전을 포함하는 `History` 객체를 반환합니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "네 개의 항목이 있습니다. 훈련과 검증 단계에서 모니터링하는 지표들입니다. 훈련 손실과 검증 손실을 그래프로 그려 보고, 훈련 정확도와 검증 정확도도 그래프로 그려서 비교해 보겠습니다:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "이 그래프에서 점선은 훈련 손실과 훈련 정확도를 나타냅니다. 실선은 검증 손실과 검증 정확도입니다.\n", + "\n", + "훈련 손실은 각 epoch마다 *감소*하고 훈련 정확성은 각 epoch마다 *증가*합니다. 경사 하강 최적화를 사용할 때 이와 같이 예상됩니다. 모든 반복에서 원하는 수량을 최소화해야 합니다.\n", + "\n", + "하지만 검증 손실과 검증 정확도에서는 그렇지 못합니다. 훈련 정확도 이전이 피크인 것 같습니다. 이는 과대적합 때문입니다. 이전에 본 적 없는 데이터보다 훈련 데이터에서 모델이 더 잘 동작합니다. 이 지점부터는 모델이 과도하게 최적화되어 테스트 데이터에서 *일반화*되지 않는 훈련 데이터의 *특정* 표현을 학습합니다.\n", + "\n", + "여기에서는 과대적합을 막기 위해 단순히 검증 정확도가 더 이상 증가하지 않는 경우에 훈련을 중단할 수 있습니다. 이를 수행하는 한 가지 방법은 `tf.keras.callbacks.EarlyStopping` 콜백을 사용하는 것입니다." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## 모델 내보내기\n", + "\n", + "위의 코드에서는 모델에 텍스트를 제공하기 전에 `TextVectorization` 레이어를 데이터세트에 적용했습니다. 모델이 원시 문자열을 처리할 수 있도록 하려면(예: 배포를 단순화하기 위해) 모델 내부에 `TextVectorization` 레이어를 포함할 수 있습니다. 이를 위해 방금 훈련한 가중치를 사용하여 새 모델을 만들 수 있습니다." 
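
The validation curves above motivate stopping training once the validation metric stops improving, which is exactly what the `tf.keras.callbacks.EarlyStopping` callback mentioned earlier does. Below is a minimal sketch of wiring it into `model.fit`; the choice of monitoring `val_loss` and the `patience` value are illustrative assumptions, not settings from the original tutorial.

```python
# A minimal sketch, assuming `model`, `train_ds`, and `val_ds` from this notebook.
# The monitored metric and patience value are illustrative choices.
import tensorflow as tf

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',         # stop when validation loss stops improving
    patience=2,                 # tolerate two epochs without improvement
    restore_best_weights=True)  # roll back to the best epoch's weights

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[early_stopping])
```
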
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### 새로운 데이터로 추론하기\n", + "\n", + "새로운 예에 대한 예측을 얻으려면 간단히 `model.predict()`를 호출하면 됩니다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "모델 내부에 텍스트 전처리 논리를 포함하면 배포를 단순화하고 [훈련/테스트 왜곡](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) 가능성을 줄이는 프로덕션용 모델을 내보낼 수 있습니다.\n", + "\n", + "TextVectorization 레이어를 적용할 위치를 선택할 때 염두에 두어야 할 성능 차이가 있습니다. 레이어를 모델 외부에서 사용하면 GPU에서 훈련할 때 비동기 CPU 처리 및 데이터 버퍼링을 수행할 수 있습니다. 따라서 GPU에서 모델을 훈련하는 경우 모델을 개발하는 동안 최상의 성능을 얻기 위해 이 옵션을 사용하고 배포 준비가 완료되면 모델 내부에 TextVectorization 레이어를 포함하도록 전환할 수 있습니다.\n", + "\n", + "모델 저장에 대해 자세히 알아보려면 이 [튜토리얼](https://www.tensorflow.org/tutorials/keras/save_and_load)을 방문하세요." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## 연습: 스택 오버플로 질문에 대한 다중 클래스 분류\n", + "\n", + "이 튜토리얼은 IMDB 데이터세트에서 이진 분류자를 처음부터 훈련하는 방법을 보여주었습니다. 연습으로, 이 노트북을 수정하여 [스택 오버플로](http://stackoverflow.com/)에서 프로그래밍 질문의 태그를 예측하도록 다중 클래스 분류자를 훈련할 수 있습니다.\n", + "\n", + "스택 오버플로에 게시된 수천 개의 프로그래밍 질문(예: \"Python에서 값을 기준으로 사전을 정렬할 수 있는 방법은?\")의 본문이 포함된 [데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)가 준비되어 있습니다. 이들 각각은 정확히 하나의 태그(Python, CSharp, JavaScript 또는 Java)로 레이블이 지정됩니다. 여러분이 할 작업은 질문을 입력으로 받아 적절한 태그(이 경우 Python)를 예측하는 것입니다.\n", + "\n", + "작업할 데이터세트에는 1,700만 개 이상의 게시물이 포함된 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow)의 훨씬 더 큰 공개 스택 오버플로 데이터세트에서 추출한 수천 개의 질문이 포함되어 있습니다.\n", + "\n", + "데이터세트를 다운로드해 보면 이전에 작업한 IMDB 데이터세트와 유사한 디렉터리 구조를 가지고 있음을 알 수 있습니다.\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "참고: 분류 문제의 난이도를 높이기 위해 프로그래밍 질문에서 Python, CSharp, JavaScript 또는 Java라는 단어의 출현은 *blank*라는 단어로 대체되었습니다(많은 질문에 해당 언어가 포함됨).\n", + "\n", + "이 연습을 완료하려면 다음과 같이 수정하여 스택 오버플로 데이터세트와 함께 작동하도록 이 노트북을 수정해야 합니다.\n", + "\n", + "1. 노트북 상단에서, 미리 준비된 [스택 오버플로 데이터세트](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)를 다운로드하는 코드로 IMDB 데이터세트를 다운로드하는 코드를 업데이트합니다. 스택 오버플로 데이터세트는 유사한 디렉터리 구조를 가지므로 많이 수정할 필요가 없습니다.\n", + "\n", + "2. 이제 4개의 출력 클래스가 있으므로 `Dense(4)`를 읽도록 모델의 마지막 레이어를 수정합니다.\n", + "\n", + "3. 모델을 컴파일할 때 손실을 `tf.keras.losses.SparseCategoricalCrossentropy`로 변경합니다. 
이것은 각 클래스의 레이블이 정수일 때(이 경우 0, *1*, *2* 또는 *3*일 수 있음) 다중 클래스 분류 문제에 사용할 올바른 손실 함수입니다. 또한 이것은 다중 클래스 분류 문제이기 때문에 메트릭을 `metrics=['accuracy']`로 변경합니다(`tf.metrics.BinaryAccuracy`는 이진 분류자에만 사용됨).\n", + "\n", + "4. 시간 경과에 따른 정확도를 표시할 때 `binary_accuracy` 및 `val_binary_accuracy`를 각각 `accuracy` 및 `val_accuracy`로 변경합니다.\n", + "\n", + "5. 이러한 변경이 완료되면 다중 클래스 분류자를 훈련할 수 있습니다. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## 더 알아보기\n", + "\n", + "이 튜토리얼은 텍스트 분류를 처음부터 알아보았습니다. 일반적인 텍스트 분류 워크플로에 대해 자세히 알아보려면 Google Developers의 [텍스트 분류 가이드](https://developers.google.com/machine-learning/guides/text-classification/)를 확인하세요.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/pt-br/tutorials/keras/text_classification.ipynb b/site/pt-br/tutorials/keras/text_classification.ipynb index 35336e943b..259fe0d015 100644 --- a/site/pt-br/tutorials/keras/text_classification.ipynb +++ b/site/pt-br/tutorials/keras/text_classification.ipynb @@ -1,978 +1,978 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# Classificação de texto" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
Ver em TensorFlow.org\n", - " Executar no Google Colab\n", - " Ver fonte no GitHub\n", - " Baixar notebook\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "Este tutorial demonstra a classificação de texto, começando pela classificação de arquivos de texto sem formatação armazenados no disco. Você treinará um classificador binário para fazer análise de sentimento para um dataset do IMDB. No final do notebook, você poderá fazer um exercício, em que treinará um classificador multiclasse para prever a tag de uma pergunta de programação no Stack Overflow.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## Análise de sentimento\n", - "\n", - "Este notebook treina um modelo de análise de sentimento para classificar avaliações de filmes como *positivas* ou *negativas*, com base no texto da avaliação. Este é um exemplo de classificação *binária*, ou de duas classes, um tipo de problema de aprendizado de máquina importante, com diversas aplicações.\n", - "\n", - "Você usará o [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/), que contém o texto de 50 mil avaliações de filmes do [Internet Movie Database](https://www.imdb.com/). Elas são divididas em 25 mil avaliações para treinamento e 25 mil para teste. Os conjuntos de treinamento e teste são *equilibrados*, ou seja, contêm a mesma quantidade de avaliações positivas e negativas.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### Baixe e explore o dataset do IMDB\n", - "\n", - "Vamos baixar e extrair o dataset, depois vamos explorar a estrutura de diretórios." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "Os diretórios `aclImdb/train/pos` e `aclImdb/train/neg` contêm diversos arquivos de texto, sendo que cada um é uma única avaliação de filme. Vamos dar uma olhada em um desses arquivos." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### Carregue o dataset\n", - "\n", - "Agora, você vai carregar os dados para fora do disco e colocá-los em um formato adequado para o treinamento. Para isso, você usará um utilitário muito útil, o [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera uma estrutura de diretórios, como mostrado abaixo.\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "Para preparar um dataset para fazer classificação binária, você precisa de duas pastas no disco, correspondentes a `class_a` e `class_b`. Elas conterão avaliações positivas e negativas de filmes, que podem ser encontradas em `aclImdb/train/pos` e `aclImdb/train/neg`. Como o dataset do IMDB contém pastas adicionais, você vai removê-las antes de usar o utilitário." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "Agora, você usará o utilitário `text_dataset_from_directory` para criar um `tf.data.Dataset` com rótulos. [tf.data](https://www.tensorflow.org/guide/data) é uma coleção de ferramentas avançadas para trabalhar com dados.\n", - "\n", - "Ao realizar um experimento de aprendizado de máquina, é uma prática recomendada dividir o dataset em três: [treinamento](https://developers.google.com/machine-learning/glossary#training_set), [validação](https://developers.google.com/machine-learning/glossary#validation_set) e [teste](https://developers.google.com/machine-learning/glossary#test-set).\n", - "\n", - "O dataset do IMDB já foi dividido em conjuntos de treinamento e teste, mas ainda falta um de validação. Vamos criar um conjunto de validação utilizando uma divisão 80/20 para os dados do treinamento por meio do argumento `validation_split` abaixo." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "Como podemos ver acima, há 25 mil exemplos na pasta de treinamento, das quais serão usadas 80%, ou 20 mil, para treinamento. Como veremos em breve, você pode treinar um modelo passando um dataset diretamente para `model.fit`. Se você ainda estiver aprendendo sobre `tf.data`, também pode fazer a iteração do dataset e exibir alguns exemplos, conforme mostrado abaixo." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "Observe que a avaliação contém texto bruto (com pontuações e tags HTML, como `
`). Você verá como lidar com isso na próxima seção.\n", - "\n", - "Os rótulos são 0 e 1. Para ver qual deles corresponde a avaliações positivas ou negativas de filmes, confira a propriedade `class_names` do dataset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "Em seguida, você criará um dataset de validação e de teste. Você usará as 5 mil avaliações restantes do conjunto de treinamento para a validação." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "Observação: ao usar os argumentos `validation_split` e `subset`, especifique uma semente aleatória ou passe `shuffle=False` para que as divisões de validação e treinamento não se sobreponham." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### Prepare o dataset para treinamento\n", - "\n", - "Em seguida, você vai padronizar, tokenizar e vetorizar os dados usando a camada `tf.keras.layers.TextVectorization`.\n", - "\n", - "Padronização refere-se ao pré-processamento do texto, tipicamente para remover pontuações ou elementos HTML a fim de simplificar o dataset. Tokenização refere-se à divisão das strings em tokens (por exemplo, dividir uma frase em palavras individuais, fazendo a divisão a cada espaço). Vetorização refere-se à conversão de tokens em números para que eles possam ser alimentados em uma rede neural. Todas essas tarefas podem ser feitas com essa camada.\n", - "\n", - "Como visto acima, as avaliações contêm diversas tags HTML, como `
`. Elas não serão removidas pelo padronizador padrão na camada `TextVectorization` (que converte texto em letras minúsculas e remove as pontuações por padrão, mas não retira código HTML). Você escreverá uma função de padronização personalizada para remover código HTML." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "Observação: para evitar o [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (também conhecido como desvio de treinamento/serviço), é importante pré-processar os dados de forma idêntica no momento de treinamento e teste. Para isso, a camada `TextVectorization` pode ser incluída diretamente dentro do modelo, conforme exibido posteriormente neste tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "Em seguida, você criará uma camada `TextVectorization`, que será usada para padronizar, tokenizar e vetorizar os dados. Você deve definir `output_mode` como `int` para criar índices de inteiros únicos para cada token.\n", - "\n", - "Observe que você está utilizando a função de divisão padrão e a função de padronização personalizada definida acima. Você também definirá algumas constantes para o modelo, como um mínimo explícito `sequence_length`, que fará a camada preencher ou truncar sequências para valores exatamente iguais a `sequence_length`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "Em seguida, chame `adapt` para adequar o estado da camada de pré-processamento ao dataset. Isso fará com que o modelo crie um índice de strings para os números inteiros." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "Observação: é importante usar somente os dados de treinamento ao chamar adapt, já que o uso do dataset de teste vazaria informações." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "Vamos criar uma função para ver o resultado ao usar esta camada para pré-processar alguns dados." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "Conforme visto acima, cada token foi substituído por um inteiro. Para visualizar o token (string) ao qual cada inteiro corresponde, você pode chamar `.get_vocabulary()` na camada." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "Está quase tudo pronto para treinar o modelo. Como etapa final de pré-processamento, você aplicará a camada TextVectorization criada anteriormente aos datasets de treinamento, validação e teste." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### Configure o dataset para melhor desempenho\n", - "\n", - "Há dois métodos importantes que você deve usar ao carregar os dados para garantir que a I/O não seja bloqueada.\n", - "\n", - "`.cache` mantém os dados na memória após o carregamento fora do disco. Isso garante que o dataset não se torne um gargalo ao treinar seu modelo. Se o dataset for muito grande para a memória, você também pode usar esse método para criar um cache no disco eficaz, que tem uma leitura mais eficiente do que vários arquivos pequenos.\n", - "\n", - "`/prefetch` sobrepõe o pré-processamento de dados e a execução do modelo durante o treinamento.\n", - "\n", - "Saiba mais sobre ambos os métodos, além de como armazenar os dados em cache no disco, no [guia sobre desempenho dos dados](https://www.tensorflow.org/guide/data_performance)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### Crie o modelo\n", - "\n", - "Chegou a hora de criar sua rede neural:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "As camadas são empilhadas sequencialmente para construir o classificador:\n", - "\n", - "1. A primeira é uma camada `Embedding`, que recebe avaliações codificadas em inteiros e avalia um vetor de embedding para cada palavra-índice. Esses vetores são aprendidos à medida que o modelo é treinado. Os vetores acrescentam uma dimensão à matriz de saída. As dimensões resultantes são: `(batch, sequence, embedding)` (lote, sequência, embedding). 
Para saber mais sobre embeddings, confira o tutorial [Embeddings de palavras](https://www.tensorflow.org/text/guide/word_embeddings).\n", - "2. A segunda camada é `GlobalAveragePooling1D`, que retorna um vetor de saída de tamanho fixo para cada exemplo, calculando a média da dimensão de sequência. Dessa forma, o modelo consegue lidar com entradas de tamanho variável da forma mais simples possível.\n", - "3. A última camada é densamente conectada com um único nó de saída." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### Função de perda e otimizador\n", - "\n", - "Todo modelo precisa de uma função de perda e um otimizador para o treinamento. Como este é um problema de classificação binária e o modelo gera como saída uma probabilidade (uma camada de unidade única com uma ativação sigmóide), você usará a função de perda `losses.BinaryCrossentropy`.\n", - "\n", - "Agora, configure o modelo para usar um otimizador e uma função de perda:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### Treine o modelo\n", - "\n", - "Você passará o objeto `dataset` ao método fit para treinar o modelo." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### Avalie o modelo\n", - "\n", - "Vamos conferir o desempenho do modelo. Serão retornados dois valores: perda (um número que representa o erro; quanto menor, melhor) e exatidão." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "Essa estratégia bem simples atinge uma exatidão de cerca de 86%." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### Crie um gráfico de exatidão e perda ao longo do tempo\n", - "\n", - "`model.fit()` retorna um objeto `History` que contém um dicionário com tudo o que aconteceu durante o treinamento:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - "Há quatro entradas: uma para cada métrica monitorada durante o treinamento e a validação. 
Você usará esses valores para plotar a perda do treinamento e da validação para fins comparativos, além da exatidão do treinamento e da validação:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "Neste gráfico, os pontos representam a perda e exatidão do treinamento, enquanto as linhas sólidas representam a perda e exatidão da validação.\n", - "\n", - "Observe que a perda do treinamento *diminui* a cada época, e a exatidão do treinamento *aumenta* a cada época. Isso é o esperado ao usar uma otimização do método do gradiente descendente, que deve minimizar a quantidade desejada em cada iteração.\n", - "\n", - "Esse não é o caso para a perda e exatidão de validação, que parecem atingir o pico antes da exatidão do treinamento. Este é um exemplo de overfitting: o modelo tem desempenho melhor com os dados de treinamento em comparação a dados nunca vistos antes. Após esse ponto, o modelo sofre uma sobreotimização e aprende representações *específicas* dos dados de treinamento que não oferecem boas *generalizações* para os dados de teste.\n", - "\n", - "Para este caso específico, é possível evitar o overfitting simplesmente parando o treinamento quando a exatidão da validação deixa de aumentar. Uma forma de fazer isso é usando o callback `tf.keras.callbacks.EarlyStopping`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## Exporte o modelo\n", - "\n", - "No código acima, você aplicou a camada `TextVectorization` ao dataset antes de alimentar o modelo com texto. Se quiser tornar o modelo capaz de processar strings brutas (por exemplo, para simplificar a implantação), é possível incluir a camada `TextVectorization` dentro do modelo. Para isso, você pode criar um novo modelo usando os pesos que acabou de treinar." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### Inferência de dados novos\n", - "\n", - "Para fazer previsões para novos exemplos, basta chamar `model.predict()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "Ao incluir a lógica de pré-processamento de texto dentro do modelo, você pode exportar um modelo para produção que simplifica a implantação e reduz o potencial de [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", - "\n", - "Há uma diferença de desempenho que você deve considerar ao escolher onde aplicar a camada TextVectorization. Ao usá-la fora do modelo, você pode fazer o processamento assíncrono na CPU e armazenar os dados em buffer ao treinar na GPU. Portanto, se você estiver treinando seu modelo na GPU, deve escolher essa opção para obter o melhor desempenho ao desenvolver o modelo. Depois, quando você estiver pronto para preparar a implantação, inclua a camada TextVectorization dentro do modelo.\n", - "\n", - "Confira este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para saber mais sobre como salvar modelos." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eSSuci_6nCEG" - }, - "source": [ - "## Exercício: classificação multiclasse para perguntas do Stack Overflow\n", - "\n", - "Este tutorial mostrou como treinar um classificador binário do zero usando o dataset do IMDB. Você pode fazer um exercício: modifique este notebook para treinar um classificador multiclasse que preveja a tag de uma pergunta de programação feita no [Stack Overflow](http://stackoverflow.com/).\n", - "\n", - "Um [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) foi preparado para uso, contendo o texto de milhares de perguntas de programação (por exemplo, \"Como posso ordenar um dicionário por valor no Python?\") publicadas no Stack Overflow. Cada pergunta é rotulada com exatamente uma tag (Python, CSharp, JavaScript ou Java). 
Sua tarefa é receber uma pergunta como entrada e prever a tag apropriada, que, neste caso, é Python.\n", - "\n", - "Você usará um dataset que contém milhares de perguntas extraídas do dataset público do Stack Overflow, que é bem maior, no [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), contendo mais de 17 milhões de publicações.\n", - "\n", - "Após baixar o dataset, você verá que ele tem uma estrutura de diretórios similar ao dataset do IMDB utilizado anteriormente:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "Observação: para aumentar a dificuldade do problema de classificação, as ocorrências das palavras Python, CSharp, JavaScript e Java nas perguntas de programação foram substituídas pela palavra *blank* (em branco), já que diversas perguntas contêm a linguagem de programação em questão.\n", - "\n", - "Para fazer este exercício, você deve modificar este notebook para que funcione com o dataset do Stack Overflow das seguintes maneiras:\n", - "\n", - "1. Na parte superior do notebook, atualize o código que baixa o dataset do IMDB com o código que baixa o [dataset do Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz), que já foi preparado. Como o dataset do Stack Overflow tem uma estrutura de diretórios parecida, você não precisará fazer muitas modificações.\n", - "\n", - "2. Modifique a última camada do modelo para `Dense(4)`, pois agora há quatro classes de saída.\n", - "\n", - "3. Ao compilar o modelo, altere a perda para `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta é a função de perda correta a ser usada para problemas de classificação muilticlasse, quando os rótulos de cada classe são inteiros (neste caso, podem ser 0, *1*, *2* ou *3*). Além disso, altere as métricas para `metrics=['accuracy']`, já que este é um problema de classificação multicasse (`tf.metrics.BinaryAccuracy` é usado somente para classificadores binários).\n", - "\n", - "4. Ao plotar a precisão ao longo do tempo, altere `binary_accuracy` e `val_binary_accuracy` para `accuracy` e `val_accuracy`, respectivamente.\n", - "\n", - "5. Após fazer essas alterações, você poderá treinar um classificador multiclasse. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## Saiba mais\n", - "\n", - "Este tutorial mostrou como fazer a classificação de texto do zero. Para saber mais sobre o workflow de classificação de texto de forma geral, confira o [guia Classificação de texto](https://developers.google.com/machine-learning/guides/text-classification/) no Google Developers.\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." 
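
For the Stack Overflow exercise described above, the concrete changes amount to a four-unit output layer, a sparse categorical loss, and plain `accuracy` as the metric. The sketch below illustrates those modifications only; it assumes the same `max_features`, `embedding_dim`, and preprocessing pipeline as the binary IMDB model and is not the only valid configuration.

```python
# A minimal sketch of the multi-class variant, assuming `max_features` and
# `embedding_dim` are defined as in the binary model above.
import tensorflow as tf
from tensorflow.keras import layers, losses

multiclass_model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(4)])  # one output unit per tag: csharp, java, javascript, python

multiclass_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),  # integer labels 0-3
    optimizer='adam',
    metrics=['accuracy'])  # replaces the BinaryAccuracy metric used by the binary classifier
```
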
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# Classificação de texto" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
Ver em TensorFlow.org\n", + " Executar no Google Colab\n", + " Ver fonte no GitHub\n", + " Baixar notebook\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "Este tutorial demonstra a classificação de texto, começando pela classificação de arquivos de texto sem formatação armazenados no disco. Você treinará um classificador binário para fazer análise de sentimento para um dataset do IMDB. No final do notebook, você poderá fazer um exercício, em que treinará um classificador multiclasse para prever a tag de uma pergunta de programação no Stack Overflow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## Análise de sentimento\n", + "\n", + "Este notebook treina um modelo de análise de sentimento para classificar avaliações de filmes como *positivas* ou *negativas*, com base no texto da avaliação. Este é um exemplo de classificação *binária*, ou de duas classes, um tipo de problema de aprendizado de máquina importante, com diversas aplicações.\n", + "\n", + "Você usará o [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/), que contém o texto de 50 mil avaliações de filmes do [Internet Movie Database](https://www.imdb.com/). Elas são divididas em 25 mil avaliações para treinamento e 25 mil para teste. Os conjuntos de treinamento e teste são *equilibrados*, ou seja, contêm a mesma quantidade de avaliações positivas e negativas.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### Baixe e explore o dataset do IMDB\n", + "\n", + "Vamos baixar e extrair o dataset, depois vamos explorar a estrutura de diretórios." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "Os diretórios `aclImdb/train/pos` e `aclImdb/train/neg` contêm diversos arquivos de texto, sendo que cada um é uma única avaliação de filme. Vamos dar uma olhada em um desses arquivos." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### Carregue o dataset\n", + "\n", + "Agora, você vai carregar os dados para fora do disco e colocá-los em um formato adequado para o treinamento. Para isso, você usará um utilitário muito útil, o [text_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory), que espera uma estrutura de diretórios, como mostrado abaixo.\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "Para preparar um dataset para fazer classificação binária, você precisa de duas pastas no disco, correspondentes a `class_a` e `class_b`. Elas conterão avaliações positivas e negativas de filmes, que podem ser encontradas em `aclImdb/train/pos` e `aclImdb/train/neg`. Como o dataset do IMDB contém pastas adicionais, você vai removê-las antes de usar o utilitário." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "Agora, você usará o utilitário `text_dataset_from_directory` para criar um `tf.data.Dataset` com rótulos. [tf.data](https://www.tensorflow.org/guide/data) é uma coleção de ferramentas avançadas para trabalhar com dados.\n", + "\n", + "Ao realizar um experimento de aprendizado de máquina, é uma prática recomendada dividir o dataset em três: [treinamento](https://developers.google.com/machine-learning/glossary#training_set), [validação](https://developers.google.com/machine-learning/glossary#validation_set) e [teste](https://developers.google.com/machine-learning/glossary#test-set).\n", + "\n", + "O dataset do IMDB já foi dividido em conjuntos de treinamento e teste, mas ainda falta um de validação. Vamos criar um conjunto de validação utilizando uma divisão 80/20 para os dados do treinamento por meio do argumento `validation_split` abaixo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "Como podemos ver acima, há 25 mil exemplos na pasta de treinamento, das quais serão usadas 80%, ou 20 mil, para treinamento. Como veremos em breve, você pode treinar um modelo passando um dataset diretamente para `model.fit`. Se você ainda estiver aprendendo sobre `tf.data`, também pode fazer a iteração do dataset e exibir alguns exemplos, conforme mostrado abaixo." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "Observe que a avaliação contém texto bruto (com pontuações e tags HTML, como `
`). Você verá como lidar com isso na próxima seção.\n", + "\n", + "Os rótulos são 0 e 1. Para ver qual deles corresponde a avaliações positivas ou negativas de filmes, confira a propriedade `class_names` do dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "Em seguida, você criará um dataset de validação e de teste. Você usará as 5 mil avaliações restantes do conjunto de treinamento para a validação." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "Observação: ao usar os argumentos `validation_split` e `subset`, especifique uma semente aleatória ou passe `shuffle=False` para que as divisões de validação e treinamento não se sobreponham." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### Prepare o dataset para treinamento\n", + "\n", + "Em seguida, você vai padronizar, tokenizar e vetorizar os dados usando a camada `tf.keras.layers.TextVectorization`.\n", + "\n", + "Padronização refere-se ao pré-processamento do texto, tipicamente para remover pontuações ou elementos HTML a fim de simplificar o dataset. Tokenização refere-se à divisão das strings em tokens (por exemplo, dividir uma frase em palavras individuais, fazendo a divisão a cada espaço). Vetorização refere-se à conversão de tokens em números para que eles possam ser alimentados em uma rede neural. Todas essas tarefas podem ser feitas com essa camada.\n", + "\n", + "Como visto acima, as avaliações contêm diversas tags HTML, como `
`. Elas não serão removidas pelo padronizador padrão na camada `TextVectorization` (que converte texto em letras minúsculas e remove as pontuações por padrão, mas não retira código HTML). Você escreverá uma função de padronização personalizada para remover código HTML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "Observação: para evitar o [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew) (também conhecido como desvio de treinamento/serviço), é importante pré-processar os dados de forma idêntica no momento de treinamento e teste. Para isso, a camada `TextVectorization` pode ser incluída diretamente dentro do modelo, conforme exibido posteriormente neste tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "Em seguida, você criará uma camada `TextVectorization`, que será usada para padronizar, tokenizar e vetorizar os dados. Você deve definir `output_mode` como `int` para criar índices de inteiros únicos para cada token.\n", + "\n", + "Observe que você está utilizando a função de divisão padrão e a função de padronização personalizada definida acima. Você também definirá algumas constantes para o modelo, como um mínimo explícito `sequence_length`, que fará a camada preencher ou truncar sequências para valores exatamente iguais a `sequence_length`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "Em seguida, chame `adapt` para adequar o estado da camada de pré-processamento ao dataset. Isso fará com que o modelo crie um índice de strings para os números inteiros." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "Observação: é importante usar somente os dados de treinamento ao chamar adapt, já que o uso do dataset de teste vazaria informações." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "Vamos criar uma função para ver o resultado ao usar esta camada para pré-processar alguns dados." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "Conforme visto acima, cada token foi substituído por um inteiro. Para visualizar o token (string) ao qual cada inteiro corresponde, você pode chamar `.get_vocabulary()` na camada." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "Está quase tudo pronto para treinar o modelo. Como etapa final de pré-processamento, você aplicará a camada TextVectorization criada anteriormente aos datasets de treinamento, validação e teste." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### Configure o dataset para melhor desempenho\n", + "\n", + "Há dois métodos importantes que você deve usar ao carregar os dados para garantir que a I/O não seja bloqueada.\n", + "\n", + "`.cache` mantém os dados na memória após serem carregados do disco. Isso garante que o dataset não se torne um gargalo ao treinar seu modelo. Se o dataset for muito grande para a memória, você também pode usar esse método para criar um cache no disco eficaz, que tem uma leitura mais eficiente do que vários arquivos pequenos.\n", + "\n", + "`.prefetch()` sobrepõe o pré-processamento de dados e a execução do modelo durante o treinamento.\n", + "\n", + "Saiba mais sobre ambos os métodos, além de como armazenar os dados em cache no disco, no [guia sobre desempenho dos dados](https://www.tensorflow.org/guide/data_performance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### Crie o modelo\n", + "\n", + "Chegou a hora de criar sua rede neural:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "As camadas são empilhadas sequencialmente para construir o classificador:\n", + "\n", + "1. A primeira é uma camada `Embedding`, que recebe avaliações codificadas em inteiros e busca um vetor de embedding para cada palavra-índice. Esses vetores são aprendidos à medida que o modelo é treinado. Os vetores acrescentam uma dimensão à matriz de saída. As dimensões resultantes são: `(batch, sequence, embedding)` (lote, sequência, embedding). 
Para saber mais sobre embeddings, confira o tutorial [Embeddings de palavras](https://www.tensorflow.org/text/guide/word_embeddings).\n", + "2. A segunda camada é `GlobalAveragePooling1D`, que retorna um vetor de saída de tamanho fixo para cada exemplo, calculando a média da dimensão de sequência. Dessa forma, o modelo consegue lidar com entradas de tamanho variável da forma mais simples possível.\n", + "3. A última camada é densamente conectada com um único nó de saída." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### Função de perda e otimizador\n", + "\n", + "Todo modelo precisa de uma função de perda e um otimizador para o treinamento. Como este é um problema de classificação binária e o modelo gera como saída uma probabilidade (uma camada de unidade única com uma ativação sigmóide), você usará a função de perda `losses.BinaryCrossentropy`.\n", + "\n", + "Agora, configure o modelo para usar um otimizador e uma função de perda:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### Treine o modelo\n", + "\n", + "Você passará o objeto `dataset` ao método fit para treinar o modelo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### Avalie o modelo\n", + "\n", + "Vamos conferir o desempenho do modelo. Serão retornados dois valores: perda (um número que representa o erro; quanto menor, melhor) e exatidão." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "Essa estratégia bem simples atinge uma exatidão de cerca de 86%." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### Crie um gráfico de exatidão e perda ao longo do tempo\n", + "\n", + "`model.fit()` retorna um objeto `History` que contém um dicionário com tudo o que aconteceu durante o treinamento:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + "Há quatro entradas: uma para cada métrica monitorada durante o treinamento e a validação. 
Você usará esses valores para plotar a perda do treinamento e da validação para fins comparativos, além da exatidão do treinamento e da validação:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "Neste gráfico, os pontos representam a perda e exatidão do treinamento, enquanto as linhas sólidas representam a perda e exatidão da validação.\n", + "\n", + "Observe que a perda do treinamento *diminui* a cada época, e a exatidão do treinamento *aumenta* a cada época. Isso é o esperado ao usar uma otimização do método do gradiente descendente, que deve minimizar a quantidade desejada em cada iteração.\n", + "\n", + "Esse não é o caso para a perda e exatidão de validação, que parecem atingir o pico antes da exatidão do treinamento. Este é um exemplo de overfitting: o modelo tem desempenho melhor com os dados de treinamento em comparação a dados nunca vistos antes. Após esse ponto, o modelo sofre uma sobreotimização e aprende representações *específicas* dos dados de treinamento que não oferecem boas *generalizações* para os dados de teste.\n", + "\n", + "Para este caso específico, é possível evitar o overfitting simplesmente parando o treinamento quando a exatidão da validação deixa de aumentar. Uma forma de fazer isso é usando o callback `tf.keras.callbacks.EarlyStopping`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## Exporte o modelo\n", + "\n", + "No código acima, você aplicou a camada `TextVectorization` ao dataset antes de alimentar o modelo com texto. Se quiser tornar o modelo capaz de processar strings brutas (por exemplo, para simplificar a implantação), é possível incluir a camada `TextVectorization` dentro do modelo. Para isso, você pode criar um novo modelo usando os pesos que acabou de treinar." 
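A título de ilustração, este é um esboço mínimo de como o callback `tf.keras.callbacks.EarlyStopping`, mencionado na discussão sobre overfitting acima, poderia ser passado a `model.fit` para interromper o treinamento quando a exatidão de validação parar de melhorar. O valor de `patience` é apenas uma escolha ilustrativa, não faz parte do tutorial original:

```python
# Esboço: interrompe o treinamento quando `val_binary_accuracy` para de melhorar
# (patience=3 é apenas ilustrativo) e restaura os melhores pesos encontrados.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    patience=3,
    restore_best_weights=True)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[early_stopping])
```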
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### Inferência de dados novos\n", + "\n", + "Para fazer previsões para novos exemplos, basta chamar `model.predict()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "Ao incluir a lógica de pré-processamento de texto dentro do modelo, você pode exportar um modelo para produção que simplifica a implantação e reduz o potencial de [desvio de treinamento/teste](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).\n", + "\n", + "Há uma diferença de desempenho que você deve considerar ao escolher onde aplicar a camada TextVectorization. Ao usá-la fora do modelo, você pode fazer o processamento assíncrono na CPU e armazenar os dados em buffer ao treinar na GPU. Portanto, se você estiver treinando seu modelo na GPU, deve escolher essa opção para obter o melhor desempenho ao desenvolver o modelo. Depois, quando você estiver pronto para preparar a implantação, inclua a camada TextVectorization dentro do modelo.\n", + "\n", + "Confira este [tutorial](https://www.tensorflow.org/tutorials/keras/save_and_load) para saber mais sobre como salvar modelos." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eSSuci_6nCEG" + }, + "source": [ + "## Exercício: classificação multiclasse para perguntas do Stack Overflow\n", + "\n", + "Este tutorial mostrou como treinar um classificador binário do zero usando o dataset do IMDB. Você pode fazer um exercício: modifique este notebook para treinar um classificador multiclasse que preveja a tag de uma pergunta de programação feita no [Stack Overflow](http://stackoverflow.com/).\n", + "\n", + "Um [dataset](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz) foi preparado para uso, contendo o texto de milhares de perguntas de programação (por exemplo, \"Como posso ordenar um dicionário por valor no Python?\") publicadas no Stack Overflow. Cada pergunta é rotulada com exatamente uma tag (Python, CSharp, JavaScript ou Java). 
Sua tarefa é receber uma pergunta como entrada e prever a tag apropriada, que, neste caso, é Python.\n", + "\n", + "Você usará um dataset que contém milhares de perguntas extraídas do dataset público do Stack Overflow, que é bem maior, no [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow), contendo mais de 17 milhões de publicações.\n", + "\n", + "Após baixar o dataset, você verá que ele tem uma estrutura de diretórios similar ao dataset do IMDB utilizado anteriormente:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "Observação: para aumentar a dificuldade do problema de classificação, as ocorrências das palavras Python, CSharp, JavaScript e Java nas perguntas de programação foram substituídas pela palavra *blank* (em branco), já que diversas perguntas contêm a linguagem de programação em questão.\n", + "\n", + "Para fazer este exercício, você deve modificar este notebook para que funcione com o dataset do Stack Overflow das seguintes maneiras:\n", + "\n", + "1. Na parte superior do notebook, atualize o código que baixa o dataset do IMDB com o código que baixa o [dataset do Stack Overflow](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz), que já foi preparado. Como o dataset do Stack Overflow tem uma estrutura de diretórios parecida, você não precisará fazer muitas modificações.\n", + "\n", + "2. Modifique a última camada do modelo para `Dense(4)`, pois agora há quatro classes de saída.\n", + "\n", + "3. Ao compilar o modelo, altere a perda para `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`. Esta é a função de perda correta a ser usada para problemas de classificação multiclasse, quando os rótulos de cada classe são inteiros (neste caso, podem ser 0, *1*, *2* ou *3*). Além disso, altere as métricas para `metrics=['accuracy']`, já que este é um problema de classificação multiclasse (`tf.metrics.BinaryAccuracy` é usado somente para classificadores binários).\n", + "\n", + "4. Ao plotar a exatidão ao longo do tempo, altere `binary_accuracy` e `val_binary_accuracy` para `accuracy` e `val_accuracy`, respectivamente.\n", + "\n", + "5. Após fazer essas alterações, você poderá treinar um classificador multiclasse (um esboço ilustrativo dessas mudanças é mostrado logo abaixo). " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## Saiba mais\n", + "\n", + "Este tutorial mostrou como fazer a classificação de texto do zero. 
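Como referência para o exercício acima, este é um esboço mínimo das alterações descritas nos passos 2 e 3, supondo que as importações, as constantes (`max_features`, `embedding_dim`) e os datasets do Stack Overflow já tenham sido preparados como no restante do notebook:

```python
# Esboço do classificador multiclasse: quatro classes de saída e perda
# SparseCategoricalCrossentropy para rótulos inteiros (0, 1, 2 ou 3).
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(4)])

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
```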
Para saber mais sobre o workflow de classificação de texto de forma geral, confira o [guia Classificação de texto](https://developers.google.com/machine-learning/guides/text-classification/) no Google Developers.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/site/zh-cn/tutorials/keras/text_classification.ipynb b/site/zh-cn/tutorials/keras/text_classification.ipynb index 4d42c2cfac..a9beeea6ec 100644 --- a/site/zh-cn/tutorials/keras/text_classification.ipynb +++ b/site/zh-cn/tutorials/keras/text_classification.ipynb @@ -1,974 +1,974 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Ic4_occAAiAT" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ioaprt5q5US7" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "yCl0eTNH5RS3" - }, - "outputs": [], - "source": [ - "#@title MIT License\n", - "#\n", - "# Copyright (c) 2017 François Chollet\n", - "#\n", - "# Permission is hereby granted, free of charge, to any person obtaining a\n", - "# copy of this software and associated documentation files (the \"Software\"),\n", - "# to deal in the Software without restriction, including without limitation\n", - "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", - "# and/or sell copies of the Software, and to permit persons to whom the\n", - "# Software is furnished to do so, subject to the following conditions:\n", - "#\n", - "# The above copyright notice and this permission notice shall be included in\n", - "# all copies or substantial portions of the Software.\n", - "#\n", - "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", - "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", - "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", - "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", - "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", - "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", - "# DEALINGS IN THE SOFTWARE." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ItXfxkxvosLH" - }, - "source": [ - "# 电影评论文本分类" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hKY4XMc9o8iB" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
在 TensorFlow.org 上查看 在 Google Colab 中运行 在 GitHub 上查看源代码 下载笔记本
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Eg62Pmz3o83v" - }, - "source": [ - "本教程演示了从存储在磁盘上的纯文本文件开始的文本分类。您将训练一个二元分类器对 IMDB 数据集执行情感分析。在笔记本的最后,有一个练习供您尝试,您将在其中训练一个多类分类器来预测 Stack Overflow 上编程问题的标签。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8RZOuS9LWQvv" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import os\n", - "import re\n", - "import shutil\n", - "import string\n", - "import tensorflow as tf\n", - "\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras import losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6-tTFS04dChr" - }, - "outputs": [], - "source": [ - "print(tf.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NBTI1bi8qdFV" - }, - "source": [ - "## 情感分析\n", - "\n", - "此笔记本训练了一个情感分析模型,利用评论文本将电影评论分类为*正面*或*负面*评价。这是一个*二元*(或二类)分类示例,也是一个重要且应用广泛的机器学习问题。\n", - "\n", - "您将使用 [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/),其中包含 [Internet Movie Database](https://www.imdb.com/) 中的 50,000 条电影评论文本 。我们将这些评论分为两组,其中 25,000 条用于训练,另外 25,000 条用于测试。训练集和测试集是*均衡的*,也就是说其中包含相等数量的正面评价和负面评价。\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iAsKG535pHep" - }, - "source": [ - "### 下载并探索 IMDB 数据集\n", - "\n", - "我们下载并提取数据集,然后浏览一下目录结构。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k7ZYnuajVlFN" - }, - "outputs": [], - "source": [ - "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", - "\n", - "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", - " untar=True, cache_dir='.',\n", - " cache_subdir='')\n", - "\n", - "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "355CfOvsV1pl" - }, - "outputs": [], - "source": [ - "os.listdir(dataset_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7ASND15oXpF1" - }, - "outputs": [], - "source": [ - "train_dir = os.path.join(dataset_dir, 'train')\n", - "os.listdir(train_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ysMNMI1CWDFD" - }, - "source": [ - "`aclImdb/train/pos` 和 `aclImdb/train/neg` 目录包含许多文本文件,每个文件都是一条电影评论。我们来看看其中的一条评论。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R7g8hFvzWLIZ" - }, - "outputs": [], - "source": [ - "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", - "with open(sample_file) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mk20TEm6ZRFP" - }, - "source": [ - "### 加载数据集\n", - "\n", - "接下来,您将从磁盘加载数据并将其准备为适合训练的格式。为此,您将使用有用的 [text_dataset_from_directory](https://tensorflow.google.cn/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 实用工具,它期望的目录结构如下所示。\n", - "\n", - "```\n", - "main_directory/\n", - "...class_a/\n", - "......a_text_1.txt\n", - "......a_text_2.txt\n", - "...class_b/\n", - "......b_text_1.txt\n", - "......b_text_2.txt\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQauv38Lnok3" - }, - "source": [ - "要准备用于二元分类的数据集,磁盘上需要有两个文件夹,分别对应于 `class_a` 和 `class_b`。这些将是正面和负面的电影评论,可以在 `aclImdb/train/pos` 和 `aclImdb/train/neg` 中找到。由于 IMDB 数据集包含其他文件夹,因此您需要在使用此实用工具之前将其移除。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VhejsClzaWfl" - }, - "outputs": [], - "source": [ - "remove_dir = 
os.path.join(train_dir, 'unsup')\n", - "shutil.rmtree(remove_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "95kkUdRoaeMw" - }, - "source": [ - "接下来,您将使用 `text_dataset_from_directory` 实用工具创建带标签的 `tf.data.Dataset`。[tf.data](https://tensorflow.google.cn/guide/data) 是一组强大的数据处理工具。\n", - "\n", - "运行机器学习实验时,最佳做法是将数据集拆成三份:[训练](https://developers.google.com/machine-learning/glossary#training_set)、[验证](https://developers.google.com/machine-learning/glossary#validation_set) 和 [测试](https://developers.google.com/machine-learning/glossary#test-set)。\n", - "\n", - "IMDB 数据集已经分成训练集和测试集,但缺少验证集。我们来通过下面的 `validation_split` 参数,使用 80:20 拆分训练数据来创建验证集。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nOrK-MTYaw3C" - }, - "outputs": [], - "source": [ - "batch_size = 32\n", - "seed = 42\n", - "\n", - "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='training', \n", - " seed=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5Y33oxOUpYkh" - }, - "source": [ - "如上所示,训练文件夹中有 25,000 个样本,您将使用其中的 80%(或 20,000 个)进行训练。稍后您将看到,您可以通过将数据集直接传递给 `model.fit` 来训练模型。如果您不熟悉 `tf.data`,还可以遍历数据集并打印出一些样本,如下所示。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51wNaPPApk1K" - }, - "outputs": [], - "source": [ - "for text_batch, label_batch in raw_train_ds.take(1):\n", - " for i in range(3):\n", - " print(\"Review\", text_batch.numpy()[i])\n", - " print(\"Label\", label_batch.numpy()[i])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWq1SUIrp1a-" - }, - "source": [ - "请注意,评论包含原始文本(带有标点符号和偶尔出现的 HTML 代码,如 `
`)。我们将在以下部分展示如何处理这些问题。\n", - "\n", - "标签为 0 或 1。要查看它们与正面和负面电影评论的对应关系,可以查看数据集上的 `class_names` 属性。\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MlICTG8spyO2" - }, - "outputs": [], - "source": [ - "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", - "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pbdO39vYqdJr" - }, - "source": [ - "接下来,您将创建验证数据集和测试数据集。您将使用训练集中剩余的 5,000 条评论进行验证。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SzxazN8Hq1pF" - }, - "source": [ - "注:使用 `validation_split` 和 `subset` 参数时,请确保要么指定随机种子,要么传递 `shuffle=False`,这样验证拆分和训练拆分就不会重叠。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JsMwwhOoqjKF" - }, - "outputs": [], - "source": [ - "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/train', \n", - " batch_size=batch_size, \n", - " validation_split=0.2, \n", - " subset='validation', \n", - " seed=seed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rdSr0Nt3q_ns" - }, - "outputs": [], - "source": [ - "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", - " 'aclImdb/test', \n", - " batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qJmTiO0IYAjm" - }, - "source": [ - "### 准备用于训练的数据集\n", - "\n", - "接下来,您将使用有用的 `tf.keras.layers.TextVectorization` 层对数据进行标准化、词例化和向量化。\n", - "\n", - "标准化是指对文本进行预处理,通常是移除标点符号或 HTML 元素以简化数据集。词例化是指将字符串分割成词例(例如,通过空格将句子分割成单个单词)。向量化是指将词例转换为数字,以便将它们输入神经网络。所有这些任务都可以通过这个层完成。\n", - "\n", - "正如您在上面看到的,评论包含各种 HTML 代码,例如 `
`。`TextVectorization` 层(默认情况下会将文本转换为小写并去除标点符号,但不会去除 HTML)中的默认标准化程序不会移除这些代码。您将编写一个自定义标准化函数来移除 HTML。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVcHl-SLrH-u" - }, - "source": [ - "注:为了防止[训练-测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(也称为训练-应用偏差),在训练和测试时间对数据进行相同的预处理非常重要。为此,可以将 `TextVectorization` 层直接包含在模型中,如本教程后面所示。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SDRI_s_tX1Hk" - }, - "outputs": [], - "source": [ - "def custom_standardization(input_data):\n", - " lowercase = tf.strings.lower(input_data)\n", - " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", - " return tf.strings.regex_replace(stripped_html,\n", - " '[%s]' % re.escape(string.punctuation),\n", - " '')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d2d3Aw8dsUux" - }, - "source": [ - "
接下来,您将创建一个 `TextVectorization` 层。您将使用该层对我们的数据进行标准化、词例化和向量化。您将 `output_mode` 设置为 `int` 以便为每个词例创建唯一的整数索引。\n", - "\n", - "请注意,您使用的是默认拆分函数,以及您在上面定义的自定义标准化函数。您还将为模型定义一些常量,例如显式的最大 `sequence_length`,这会使层将序列填充或截断为精确的 `sequence_length` 值。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-c76RvSzsMnX" - }, - "outputs": [], - "source": [ - "max_features = 10000\n", - "sequence_length = 250\n", - "\n", - "vectorize_layer = layers.TextVectorization(\n", - " standardize=custom_standardization,\n", - " max_tokens=max_features,\n", - " output_mode='int',\n", - " output_sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vlFOpfF6scT6" - }, - "source": [ - "接下来,您将调用 `adapt` 以使预处理层的状态适合数据集。这会使模型构建字符串到整数的索引。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAhdjK7AtroA" - }, - "source": [ - "注:在调用时请务必仅使用您的训练数据(使用测试集会泄漏信息)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH4_2ZGJsa_X" - }, - "outputs": [], - "source": [ - "# Make a text-only dataset (without labels), then call adapt\n", - "train_text = raw_train_ds.map(lambda x, y: x)\n", - "vectorize_layer.adapt(train_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SHQVEFzNt-K_" - }, - "source": [ - "我们来创建一个函数来查看使用该层预处理一些数据的结果。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SCIg_T50wOCU" - }, - "outputs": [], - "source": [ - "def vectorize_text(text, label):\n", - " text = tf.expand_dims(text, -1)\n", - " return vectorize_layer(text), label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XULcm6B3xQIO" - }, - "outputs": [], - "source": [ - "# retrieve a batch (of 32 reviews and labels) from the dataset\n", - "text_batch, label_batch = next(iter(raw_train_ds))\n", - "first_review, first_label = text_batch[0], label_batch[0]\n", - "print(\"Review\", first_review)\n", - "print(\"Label\", raw_train_ds.class_names[first_label])\n", - "print(\"Vectorized review\", vectorize_text(first_review, first_label))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6u5EX0hxyNZT" - }, - "source": [ - "正如您在上面看到的,每个词例都被一个整数替换了。您可以通过在该层上调用 `.get_vocabulary()` 来查找每个整数对应的词例(字符串)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kRq9hTQzhVhW" - }, - "outputs": [], - "source": [ - "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", - "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", - "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XD2H6utRydGv" - }, - "source": [ - "你几乎已经准备好训练你的模型了。作为最后的预处理步骤,你将在训练、验证和测试数据集上应用之前创建的TextVectorization层。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2zhmpeViI1iG" - }, - "outputs": [], - "source": [ - "train_ds = raw_train_ds.map(vectorize_text)\n", - "val_ds = raw_val_ds.map(vectorize_text)\n", - "test_ds = raw_test_ds.map(vectorize_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsVQyPMizjuO" - }, - "source": [ - "### 配置数据集以提高性能\n", - "\n", - "以下是加载数据时应该使用的两种重要方法,以确保 I/O 不会阻塞。\n", - "\n", - "从磁盘加载后,`.cache()` 会将数据保存在内存中。这将确保数据集在训练模型时不会成为瓶颈。如果您的数据集太大而无法放入内存,也可以使用此方法创建高性能的磁盘缓存,这比许多小文件的读取效率更高。\n", - "\n", - "`prefetch()` 会在训练时将数据预处理和模型执行重叠。\n", - "\n", - "您可以在[数据性能指南](https://tensorflow.google.cn/guide/data_performance)中深入了解这两种方法,以及如何将数据缓存到磁盘。" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wMcs_H7izm5m" - }, - "outputs": [], - "source": [ - "AUTOTUNE = tf.data.AUTOTUNE\n", - "\n", - "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", - "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LLC02j2g-llC" - }, - "source": [ - "### 创建模型\n", - "\n", - "是时候创建您的神经网络了:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkQP6in8yUBR" - }, - "outputs": [], - "source": [ - "embedding_dim = 16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xpKOoWgu-llD" - }, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " layers.Embedding(max_features + 1, embedding_dim),\n", - " layers.Dropout(0.2),\n", - " layers.GlobalAveragePooling1D(),\n", - " layers.Dropout(0.2),\n", - " layers.Dense(1)])\n", - "\n", - "model.summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PbKQ6mucuKL" - }, - "source": [ - "层按顺序堆叠以构建分类器:\n", - "\n", - "1. 第一个层是 `Embedding` 层。此层采用整数编码的评论,并查找每个单词索引的嵌入向量。这些向量是通过模型训练学习到的。向量向输出数组增加了一个维度。得到的维度为:`(batch, sequence, embedding)`。要详细了解嵌入向量,请参阅[单词嵌入向量](https://tensorflow.google.cn/text/guide/word_embeddings)教程。\n", - "2. 接下来,`GlobalAveragePooling1D` 将通过对序列维度求平均值来为每个样本返回一个定长输出向量。这允许模型以尽可能最简单的方式处理变长输入。\n", - "3. 最后一层与单个输出结点密集连接。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L4EqVWg4-llM" - }, - "source": [ - "### 损失函数与优化器\n", - "\n", - "模型训练需要一个损失函数和一个优化器。由于这是一个二元分类问题,并且模型输出概率(具有 Sigmoid 激活的单一单元层),我们将使用 `losses.BinaryCrossentropy` 损失函数。\n", - "\n", - "现在,配置模型以使用优化器和损失函数:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mr0GP-cQ-llN" - }, - "outputs": [], - "source": [ - "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer='adam',\n", - " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "35jv_fzP-llU" - }, - "source": [ - "### 训练模型\n", - "\n", - "将 `dataset` 对象传递给 fit 方法,对模型进行训练。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tXSGrjWZ-llW" - }, - "outputs": [], - "source": [ - "epochs = 10\n", - "history = model.fit(\n", - " train_ds,\n", - " validation_data=val_ds,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9EEGuDVuzb5r" - }, - "source": [ - "### 评估模型\n", - "\n", - "我们来看一下模型的性能如何。将返回两个值。损失值(loss)(一个表示误差的数字,值越低越好)与准确率(accuracy)。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zOMKywn4zReN" - }, - "outputs": [], - "source": [ - "loss, accuracy = model.evaluate(test_ds)\n", - "\n", - "print(\"Loss: \", loss)\n", - "print(\"Accuracy: \", accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1iEXVTR0Z2t" - }, - "source": [ - "这种十分简单的方式实现了约 86% 的准确率。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ldbQqCw2Xc1W" - }, - "source": [ - "### 创建准确率和损失随时间变化的图表\n", - "\n", - "`model.fit()` 会返回包含一个字典的 `History` 对象。该字典包含训练过程中产生的所有信息:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-YcvZsdvWfDf" - }, - "outputs": [], - "source": [ - "history_dict = history.history\n", - "history_dict.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1_CH32qJXruI" - }, - "source": [ - 
"其中有四个条目:每个条目代表训练和验证过程中的一项监测指标。您可以使用这些指标来绘制用于比较的训练损失和验证损失图表,以及训练准确率和验证准确率图表:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2SEMeQ5YXs8z" - }, - "outputs": [], - "source": [ - "acc = history_dict['binary_accuracy']\n", - "val_acc = history_dict['val_binary_accuracy']\n", - "loss = history_dict['loss']\n", - "val_loss = history_dict['val_loss']\n", - "\n", - "epochs = range(1, len(acc) + 1)\n", - "\n", - "# \"bo\" is for \"blue dot\"\n", - "plt.plot(epochs, loss, 'bo', label='Training loss')\n", - "# b is for \"solid blue line\"\n", - "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", - "plt.title('Training and validation loss')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Loss')\n", - "plt.legend()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z3PJemLPXwz_" - }, - "outputs": [], - "source": [ - "plt.plot(epochs, acc, 'bo', label='Training acc')\n", - "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", - "plt.title('Training and validation accuracy')\n", - "plt.xlabel('Epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.legend(loc='lower right')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFFyCuJoXy7r" - }, - "source": [ - "在该图表中,虚线代表训练损失和准确率,实线代表验证损失和准确率。\n", - "\n", - "请注意,训练损失会逐周期*下降*,而训练准确率则逐周期*上升*。使用梯度下降优化时,这是预期结果,它应该在每次迭代中最大限度减少所需的数量。\n", - "\n", - "但是,对于验证损失和准确率来说则不然——它们似乎会在训练转确率之前达到顶点。这是过拟合的一个例子:模型在训练数据上的表现要好于在之前从未见过的数据上的表现。经过这一点之后,模型会过度优化和学习*特定*于训练数据的表示,但无法*泛化*到测试数据。\n", - "\n", - "对于这种特殊情况,您可以通过在验证准确率不再增加时直接停止训练来防止过度拟合。一种方式是使用 `tf.keras.callbacks.EarlyStopping` 回调。" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-to23J3Vy5d3" - }, - "source": [ - "## 导出模型\n", - "\n", - "在上面的代码中,您在向模型馈送文本之前对数据集应用了 `TextVectorization`。 如果您想让模型能够处理原始字符串(例如,为了简化部署),您可以在模型中包含 `TextVectorization` 层。为此,您可以使用刚刚训练的权重创建一个新模型。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWXsMvryuZuq" - }, - "outputs": [], - "source": [ - "export_model = tf.keras.Sequential([\n", - " vectorize_layer,\n", - " model,\n", - " layers.Activation('sigmoid')\n", - "])\n", - "\n", - "export_model.compile(\n", - " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", - ")\n", - "\n", - "# Test it with `raw_test_ds`, which yields raw strings\n", - "loss, accuracy = export_model.evaluate(raw_test_ds)\n", - "print(accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TwQgoN88LoEF" - }, - "source": [ - "### 使用新数据进行推断\n", - "\n", - "要获得对新样本的预测,只需调用 `model.predict()` 即可。" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QW355HH5L49K" - }, - "outputs": [], - "source": [ - "examples = [\n", - " \"The movie was great!\",\n", - " \"The movie was okay.\",\n", - " \"The movie was terrible...\"\n", - "]\n", - "\n", - "export_model.predict(examples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MaxlpFWpzR6c" - }, - "source": [ - "将文本预处理逻辑包含在模型中后,您可以导出用于生产的模型,从而简化部署并降低[训练/测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)的可能性。\n", - "\n", - "在选择应用 TextVectorization 层的位置时,需要注意性能差异。在模型之外使用它可以让您在 GPU 上训练时进行异步 CPU 处理和数据缓冲。因此,如果您在 GPU 上训练模型,您应该在开发模型时使用此选项以获得最佳性能,然后在准备好部署时进行切换,在模型中包含 TextVectorization 层。\n", - "\n", - "请参阅此[教程](https://tensorflow.google.cn/tutorials/keras/save_and_load),详细了解如何保存模型。" - ] - }, - { - "cell_type": "markdown", - "metadata": { 
- "id": "eSSuci_6nCEG" - }, - "source": [ - "## 练习:对 Stack Overflow 问题进行多类分类\n", - "\n", - "本教程展示了如何在 IMDB 数据集上从头开始训练二元分类器。作为练习,您可以修改此笔记本以训练多类分类器来预测 [Stack Overflow](http://stackoverflow.com/) 上的编程问题的标签。\n", - "\n", - "我们已经准备好了一个[数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)供您使用,其中包含了几千个发布在 Stack Overflow 上的编程问题(例如,\"How can sort a dictionary by value in Python?\")。每一个问题都只有一个标签(Python、CSharp、JavaScript 或 Java)。您的任务是将问题作为输入,并预测适当的标签,在本例中为 Python。\n", - "\n", - "您将使用的数据集包含从 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) 上更大的公共 Stack Overflow 数据集提取的数千个问题,其中包含超过 1700 万个帖子。\n", - "\n", - "下载数据集后,您会发现它与您之前使用的 IMDB 数据集具有相似的目录结构:\n", - "\n", - "```\n", - "train/\n", - "...python/\n", - "......0.txt\n", - "......1.txt\n", - "...javascript/\n", - "......0.txt\n", - "......1.txt\n", - "...csharp/\n", - "......0.txt\n", - "......1.txt\n", - "...java/\n", - "......0.txt\n", - "......1.txt\n", - "```\n", - "\n", - "注:为了增加分类问题的难度,编程问题中出现的 Python、CSharp、JavaScript 或 Java 等词已被替换为 *blank*(因为许多问题都包含它们所涉及的语言)。\n", - "\n", - "要完成此练习,您应该对此笔记本进行以下修改以使用 Stack Overflow 数据集:\n", - "\n", - "1. 在笔记本顶部,将下载 IMDB 数据集的代码更新为下载前面准备好的 [Stack Overflow 数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)的代码。由于 Stack Overflow 数据集具有类似的目录结构,因此您不需要进行太多修改。\n", - "\n", - "2. 将模型的最后一层修改为 `Dense(4)`,因为现在有四个输出类。\n", - "\n", - "3. 编译模型时,将损失更改为 `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`。当每个类的标签是整数(在本例中,它们可以是 0、*1*、*2* 或 *3*)时,这是用于多类分类问题的正确损失函数。 此外,将指标更改为 `metrics=['accuracy']`,因为这是一个多类分类问题(`tf.metrics.BinaryAccuracy` 仅用于二元分类器 )。\n", - "\n", - "4. 在绘制随时间变化的准确率时,请将 `binary_accuracy` 和 `val_binary_accuracy` 分别更改为 `accuracy` 和 `val_accuracy`。\n", - "\n", - "5. 完成这些更改后,就可以训练多类分类器了。 " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F0T5SIwSm7uc" - }, - "source": [ - "## 了解更多信息\n", - "\n", - "本教程从头开始介绍了文本分类。要详细了解一般的文本分类工作流程,请查看 Google Developers 提供的[文本分类指南](https://developers.google.com/machine-learning/guides/text-classification/)。\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "text_classification.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ic4_occAAiAT" + }, + "source": [ + "##### Copyright 2019 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ioaprt5q5US7" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yCl0eTNH5RS3" + }, + "outputs": [], + "source": [ + "#@title MIT License\n", + "#\n", + "# Copyright (c) 2017 François Chollet\n", + "#\n", + "# Permission is hereby granted, free of charge, to any person obtaining a\n", + "# copy of this software and associated documentation files (the \"Software\"),\n", + "# to deal in the Software without restriction, including without limitation\n", + "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n", + "# and/or sell copies of the Software, and to permit persons to whom the\n", + "# Software is furnished to do so, subject to the following conditions:\n", + "#\n", + "# The above copyright notice and this permission notice shall be included in\n", + "# all copies or substantial portions of the Software.\n", + "#\n", + "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n", + "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n", + "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n", + "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n", + "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n", + "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n", + "# DEALINGS IN THE SOFTWARE." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ItXfxkxvosLH" + }, + "source": [ + "# 电影评论文本分类" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hKY4XMc9o8iB" + }, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
在 TensorFlow.org 上查看 在 Google Colab 中运行 在 GitHub 上查看源代码 下载笔记本
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Eg62Pmz3o83v" + }, + "source": [ + "本教程演示了从存储在磁盘上的纯文本文件开始的文本分类。您将训练一个二元分类器对 IMDB 数据集执行情感分析。在笔记本的最后,有一个练习供您尝试,您将在其中训练一个多类分类器来预测 Stack Overflow 上编程问题的标签。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8RZOuS9LWQvv" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import os\n", + "import re\n", + "import shutil\n", + "import string\n", + "import tensorflow as tf\n", + "\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras import losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6-tTFS04dChr" + }, + "outputs": [], + "source": [ + "print(tf.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NBTI1bi8qdFV" + }, + "source": [ + "## 情感分析\n", + "\n", + "此笔记本训练了一个情感分析模型,利用评论文本将电影评论分类为*正面*或*负面*评价。这是一个*二元*(或二类)分类示例,也是一个重要且应用广泛的机器学习问题。\n", + "\n", + "您将使用 [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/),其中包含 [Internet Movie Database](https://www.imdb.com/) 中的 50,000 条电影评论文本 。我们将这些评论分为两组,其中 25,000 条用于训练,另外 25,000 条用于测试。训练集和测试集是*均衡的*,也就是说其中包含相等数量的正面评价和负面评价。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iAsKG535pHep" + }, + "source": [ + "### 下载并探索 IMDB 数据集\n", + "\n", + "我们下载并提取数据集,然后浏览一下目录结构。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k7ZYnuajVlFN" + }, + "outputs": [], + "source": [ + "url = \"https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n", + "\n", + "dataset = tf.keras.utils.get_file(\"aclImdb_v1\", url,\n", + " untar=True, cache_dir='.',\n", + " cache_subdir='')\n", + "\n", + "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "355CfOvsV1pl" + }, + "outputs": [], + "source": [ + "os.listdir(dataset_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7ASND15oXpF1" + }, + "outputs": [], + "source": [ + "train_dir = os.path.join(dataset_dir, 'train')\n", + "os.listdir(train_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysMNMI1CWDFD" + }, + "source": [ + "`aclImdb/train/pos` 和 `aclImdb/train/neg` 目录包含许多文本文件,每个文件都是一条电影评论。我们来看看其中的一条评论。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R7g8hFvzWLIZ" + }, + "outputs": [], + "source": [ + "sample_file = os.path.join(train_dir, 'pos/1181_9.txt')\n", + "with open(sample_file) as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mk20TEm6ZRFP" + }, + "source": [ + "### 加载数据集\n", + "\n", + "接下来,您将从磁盘加载数据并将其准备为适合训练的格式。为此,您将使用有用的 [text_dataset_from_directory](https://tensorflow.google.cn/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory) 实用工具,它期望的目录结构如下所示。\n", + "\n", + "```\n", + "main_directory/\n", + "...class_a/\n", + "......a_text_1.txt\n", + "......a_text_2.txt\n", + "...class_b/\n", + "......b_text_1.txt\n", + "......b_text_2.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQauv38Lnok3" + }, + "source": [ + "要准备用于二元分类的数据集,磁盘上需要有两个文件夹,分别对应于 `class_a` 和 `class_b`。这些将是正面和负面的电影评论,可以在 `aclImdb/train/pos` 和 `aclImdb/train/neg` 中找到。由于 IMDB 数据集包含其他文件夹,因此您需要在使用此实用工具之前将其移除。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VhejsClzaWfl" + }, + "outputs": [], + "source": [ + "remove_dir = 
os.path.join(train_dir, 'unsup')\n", + "shutil.rmtree(remove_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "95kkUdRoaeMw" + }, + "source": [ + "接下来,您将使用 `text_dataset_from_directory` 实用工具创建带标签的 `tf.data.Dataset`。[tf.data](https://tensorflow.google.cn/guide/data) 是一组强大的数据处理工具。\n", + "\n", + "运行机器学习实验时,最佳做法是将数据集拆成三份:[训练](https://developers.google.com/machine-learning/glossary#training_set)、[验证](https://developers.google.com/machine-learning/glossary#validation_set) 和 [测试](https://developers.google.com/machine-learning/glossary#test-set)。\n", + "\n", + "IMDB 数据集已经分成训练集和测试集,但缺少验证集。我们来通过下面的 `validation_split` 参数,使用 80:20 拆分训练数据来创建验证集。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nOrK-MTYaw3C" + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seed = 42\n", + "\n", + "raw_train_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='training', \n", + " seed=seed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Y33oxOUpYkh" + }, + "source": [ + "如上所示,训练文件夹中有 25,000 个样本,您将使用其中的 80%(或 20,000 个)进行训练。稍后您将看到,您可以通过将数据集直接传递给 `model.fit` 来训练模型。如果您不熟悉 `tf.data`,还可以遍历数据集并打印出一些样本,如下所示。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51wNaPPApk1K" + }, + "outputs": [], + "source": [ + "for text_batch, label_batch in raw_train_ds.take(1):\n", + " for i in range(3):\n", + " print(\"Review\", text_batch.numpy()[i])\n", + " print(\"Label\", label_batch.numpy()[i])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JWq1SUIrp1a-" + }, + "source": [ + "请注意,评论包含原始文本(带有标点符号和偶尔出现的 HTML 代码,如 `
`)。我们将在以下部分展示如何处理这些问题。\n", + "\n", + "标签为 0 或 1。要查看它们与正面和负面电影评论的对应关系,可以查看数据集上的 `class_names` 属性。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MlICTG8spyO2" + }, + "outputs": [], + "source": [ + "print(\"Label 0 corresponds to\", raw_train_ds.class_names[0])\n", + "print(\"Label 1 corresponds to\", raw_train_ds.class_names[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pbdO39vYqdJr" + }, + "source": [ + "接下来,您将创建验证数据集和测试数据集。您将使用训练集中剩余的 5,000 条评论进行验证。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SzxazN8Hq1pF" + }, + "source": [ + "注:使用 `validation_split` 和 `subset` 参数时,请确保要么指定随机种子,要么传递 `shuffle=False`,这样验证拆分和训练拆分就不会重叠。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JsMwwhOoqjKF" + }, + "outputs": [], + "source": [ + "raw_val_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/train', \n", + " batch_size=batch_size, \n", + " validation_split=0.2, \n", + " subset='validation', \n", + " seed=seed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rdSr0Nt3q_ns" + }, + "outputs": [], + "source": [ + "raw_test_ds = tf.keras.utils.text_dataset_from_directory(\n", + " 'aclImdb/test', \n", + " batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJmTiO0IYAjm" + }, + "source": [ + "### 准备用于训练的数据集\n", + "\n", + "接下来,您将使用有用的 `tf.keras.layers.TextVectorization` 层对数据进行标准化、词例化和向量化。\n", + "\n", + "标准化是指对文本进行预处理,通常是移除标点符号或 HTML 元素以简化数据集。词例化是指将字符串分割成词例(例如,通过空格将句子分割成单个单词)。向量化是指将词例转换为数字,以便将它们输入神经网络。所有这些任务都可以通过这个层完成。\n", + "\n", + "正如您在上面看到的,评论包含各种 HTML 代码,例如 `
`。`TextVectorization` 层(默认情况下会将文本转换为小写并去除标点符号,但不会去除 HTML)中的默认标准化程序不会移除这些代码。您将编写一个自定义标准化函数来移除 HTML。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZVcHl-SLrH-u" + }, + "source": [ + "注:为了防止[训练-测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)(也称为训练-应用偏差),在训练和测试时间对数据进行相同的预处理非常重要。为此,可以将 `TextVectorization` 层直接包含在模型中,如本教程后面所示。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SDRI_s_tX1Hk" + }, + "outputs": [], + "source": [ + "def custom_standardization(input_data):\n", + " lowercase = tf.strings.lower(input_data)\n", + " stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')\n", + " return tf.strings.regex_replace(stripped_html,\n", + " '[%s]' % re.escape(string.punctuation),\n", + " '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2d3Aw8dsUux" + }, + "source": [ + "
接下来,您将创建一个 `TextVectorization` 层。您将使用该层对我们的数据进行标准化、词例化和向量化。您将 `output_mode` 设置为 `int` 以便为每个词例创建唯一的整数索引。\n", + "\n", + "请注意,您使用的是默认拆分函数,以及您在上面定义的自定义标准化函数。您还将为模型定义一些常量,例如显式的最大 `sequence_length`,这会使层将序列填充或截断为精确的 `sequence_length` 值。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-c76RvSzsMnX" + }, + "outputs": [], + "source": [ + "max_features = 10000\n", + "sequence_length = 250\n", + "\n", + "vectorize_layer = layers.TextVectorization(\n", + " standardize=custom_standardization,\n", + " max_tokens=max_features,\n", + " output_mode='int',\n", + " output_sequence_length=sequence_length)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vlFOpfF6scT6" + }, + "source": [ + "接下来,您将调用 `adapt` 以使预处理层的状态适合数据集。这会使模型构建字符串到整数的索引。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAhdjK7AtroA" + }, + "source": [ + "注:在调用时请务必仅使用您的训练数据(使用测试集会泄漏信息)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH4_2ZGJsa_X" + }, + "outputs": [], + "source": [ + "# Make a text-only dataset (without labels), then call adapt\n", + "train_text = raw_train_ds.map(lambda x, y: x)\n", + "vectorize_layer.adapt(train_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQVEFzNt-K_" + }, + "source": [ + "我们来创建一个函数来查看使用该层预处理一些数据的结果。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SCIg_T50wOCU" + }, + "outputs": [], + "source": [ + "def vectorize_text(text, label):\n", + " text = tf.expand_dims(text, -1)\n", + " return vectorize_layer(text), label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XULcm6B3xQIO" + }, + "outputs": [], + "source": [ + "# retrieve a batch (of 32 reviews and labels) from the dataset\n", + "text_batch, label_batch = next(iter(raw_train_ds))\n", + "first_review, first_label = text_batch[0], label_batch[0]\n", + "print(\"Review\", first_review)\n", + "print(\"Label\", raw_train_ds.class_names[first_label])\n", + "print(\"Vectorized review\", vectorize_text(first_review, first_label))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6u5EX0hxyNZT" + }, + "source": [ + "正如您在上面看到的,每个词例都被一个整数替换了。您可以通过在该层上调用 `.get_vocabulary()` 来查找每个整数对应的词例(字符串)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kRq9hTQzhVhW" + }, + "outputs": [], + "source": [ + "print(\"1287 ---> \",vectorize_layer.get_vocabulary()[1287])\n", + "print(\" 313 ---> \",vectorize_layer.get_vocabulary()[313])\n", + "print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XD2H6utRydGv" + }, + "source": [ + "你几乎已经准备好训练你的模型了。作为最后的预处理步骤,你将在训练、验证和测试数据集上应用之前创建的TextVectorization层。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2zhmpeViI1iG" + }, + "outputs": [], + "source": [ + "train_ds = raw_train_ds.map(vectorize_text)\n", + "val_ds = raw_val_ds.map(vectorize_text)\n", + "test_ds = raw_test_ds.map(vectorize_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsVQyPMizjuO" + }, + "source": [ + "### 配置数据集以提高性能\n", + "\n", + "以下是加载数据时应该使用的两种重要方法,以确保 I/O 不会阻塞。\n", + "\n", + "从磁盘加载后,`.cache()` 会将数据保存在内存中。这将确保数据集在训练模型时不会成为瓶颈。如果您的数据集太大而无法放入内存,也可以使用此方法创建高性能的磁盘缓存,这比许多小文件的读取效率更高。\n", + "\n", + "`prefetch()` 会在训练时将数据预处理和模型执行重叠。\n", + "\n", + "您可以在[数据性能指南](https://tensorflow.google.cn/guide/data_performance)中深入了解这两种方法,以及如何将数据缓存到磁盘。" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMcs_H7izm5m" + }, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.AUTOTUNE\n", + "\n", + "train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)\n", + "test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLC02j2g-llC" + }, + "source": [ + "### 创建模型\n", + "\n", + "是时候创建您的神经网络了:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkQP6in8yUBR" + }, + "outputs": [], + "source": [ + "embedding_dim = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xpKOoWgu-llD" + }, + "outputs": [], + "source": [ + "model = tf.keras.Sequential([\n", + " layers.Embedding(max_features + 1, embedding_dim),\n", + " layers.Dropout(0.2),\n", + " layers.GlobalAveragePooling1D(),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(1)])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6PbKQ6mucuKL" + }, + "source": [ + "层按顺序堆叠以构建分类器:\n", + "\n", + "1. 第一个层是 `Embedding` 层。此层采用整数编码的评论,并查找每个单词索引的嵌入向量。这些向量是通过模型训练学习到的。向量向输出数组增加了一个维度。得到的维度为:`(batch, sequence, embedding)`。要详细了解嵌入向量,请参阅[单词嵌入向量](https://tensorflow.google.cn/text/guide/word_embeddings)教程。\n", + "2. 接下来,`GlobalAveragePooling1D` 将通过对序列维度求平均值来为每个样本返回一个定长输出向量。这允许模型以尽可能最简单的方式处理变长输入。\n", + "3. 最后一层与单个输出结点密集连接。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L4EqVWg4-llM" + }, + "source": [ + "### 损失函数与优化器\n", + "\n", + "模型训练需要一个损失函数和一个优化器。由于这是一个二元分类问题,并且模型输出概率(具有 Sigmoid 激活的单一单元层),我们将使用 `losses.BinaryCrossentropy` 损失函数。\n", + "\n", + "现在,配置模型以使用优化器和损失函数:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mr0GP-cQ-llN" + }, + "outputs": [], + "source": [ + "model.compile(loss=losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer='adam',\n", + " metrics=tf.metrics.BinaryAccuracy(threshold=0.0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "35jv_fzP-llU" + }, + "source": [ + "### 训练模型\n", + "\n", + "将 `dataset` 对象传递给 fit 方法,对模型进行训练。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tXSGrjWZ-llW" + }, + "outputs": [], + "source": [ + "epochs = 10\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=val_ds,\n", + " epochs=epochs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EEGuDVuzb5r" + }, + "source": [ + "### 评估模型\n", + "\n", + "我们来看一下模型的性能如何。将返回两个值。损失值(loss)(一个表示误差的数字,值越低越好)与准确率(accuracy)。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOMKywn4zReN" + }, + "outputs": [], + "source": [ + "loss, accuracy = model.evaluate(test_ds)\n", + "\n", + "print(\"Loss: \", loss)\n", + "print(\"Accuracy: \", accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1iEXVTR0Z2t" + }, + "source": [ + "这种十分简单的方式实现了约 86% 的准确率。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldbQqCw2Xc1W" + }, + "source": [ + "### 创建准确率和损失随时间变化的图表\n", + "\n", + "`model.fit()` 会返回包含一个字典的 `History` 对象。该字典包含训练过程中产生的所有信息:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-YcvZsdvWfDf" + }, + "outputs": [], + "source": [ + "history_dict = history.history\n", + "history_dict.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1_CH32qJXruI" + }, + "source": [ + 
"其中有四个条目:每个条目代表训练和验证过程中的一项监测指标。您可以使用这些指标来绘制用于比较的训练损失和验证损失图表,以及训练准确率和验证准确率图表:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2SEMeQ5YXs8z" + }, + "outputs": [], + "source": [ + "acc = history_dict['binary_accuracy']\n", + "val_acc = history_dict['val_binary_accuracy']\n", + "loss = history_dict['loss']\n", + "val_loss = history_dict['val_loss']\n", + "\n", + "epochs = range(1, len(acc) + 1)\n", + "\n", + "# \"bo\" is for \"blue dot\"\n", + "plt.plot(epochs, loss, 'bo', label='Training loss')\n", + "# b is for \"solid blue line\"\n", + "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", + "plt.title('Training and validation loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PJemLPXwz_" + }, + "outputs": [], + "source": [ + "plt.plot(epochs, acc, 'bo', label='Training acc')\n", + "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", + "plt.title('Training and validation accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend(loc='lower right')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFFyCuJoXy7r" + }, + "source": [ + "在该图表中,虚线代表训练损失和准确率,实线代表验证损失和准确率。\n", + "\n", + "请注意,训练损失会逐周期*下降*,而训练准确率则逐周期*上升*。使用梯度下降优化时,这是预期结果,它应该在每次迭代中最大限度减少所需的数量。\n", + "\n", + "但是,对于验证损失和准确率来说则不然——它们似乎会在训练转确率之前达到顶点。这是过拟合的一个例子:模型在训练数据上的表现要好于在之前从未见过的数据上的表现。经过这一点之后,模型会过度优化和学习*特定*于训练数据的表示,但无法*泛化*到测试数据。\n", + "\n", + "对于这种特殊情况,您可以通过在验证准确率不再增加时直接停止训练来防止过度拟合。一种方式是使用 `tf.keras.callbacks.EarlyStopping` 回调。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-to23J3Vy5d3" + }, + "source": [ + "## 导出模型\n", + "\n", + "在上面的代码中,您在向模型馈送文本之前对数据集应用了 `TextVectorization`。 如果您想让模型能够处理原始字符串(例如,为了简化部署),您可以在模型中包含 `TextVectorization` 层。为此,您可以使用刚刚训练的权重创建一个新模型。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWXsMvryuZuq" + }, + "outputs": [], + "source": [ + "export_model = tf.keras.Sequential([\n", + " vectorize_layer,\n", + " model,\n", + " layers.Activation('sigmoid')\n", + "])\n", + "\n", + "export_model.compile(\n", + " loss=losses.BinaryCrossentropy(from_logits=False), optimizer=\"adam\", metrics=['accuracy']\n", + ")\n", + "\n", + "# Test it with `raw_test_ds`, which yields raw strings\n", + "loss, accuracy = export_model.evaluate(raw_test_ds)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TwQgoN88LoEF" + }, + "source": [ + "### 使用新数据进行推断\n", + "\n", + "要获得对新样本的预测,只需调用 `model.predict()` 即可。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QW355HH5L49K" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"The movie was great!\",\n", + " \"The movie was okay.\",\n", + " \"The movie was terrible...\"\n", + "]\n", + "\n", + "export_model.predict(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MaxlpFWpzR6c" + }, + "source": [ + "将文本预处理逻辑包含在模型中后,您可以导出用于生产的模型,从而简化部署并降低[训练/测试偏差](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew)的可能性。\n", + "\n", + "在选择应用 TextVectorization 层的位置时,需要注意性能差异。在模型之外使用它可以让您在 GPU 上训练时进行异步 CPU 处理和数据缓冲。因此,如果您在 GPU 上训练模型,您应该在开发模型时使用此选项以获得最佳性能,然后在准备好部署时进行切换,在模型中包含 TextVectorization 层。\n", + "\n", + "请参阅此[教程](https://tensorflow.google.cn/tutorials/keras/save_and_load),详细了解如何保存模型。" + ] + }, + { + "cell_type": "markdown", + "metadata": { 
+ "id": "eSSuci_6nCEG" + }, + "source": [ + "## 练习:对 Stack Overflow 问题进行多类分类\n", + "\n", + "本教程展示了如何在 IMDB 数据集上从头开始训练二元分类器。作为练习,您可以修改此笔记本以训练多类分类器来预测 [Stack Overflow](http://stackoverflow.com/) 上的编程问题的标签。\n", + "\n", + "我们已经准备好了一个[数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)供您使用,其中包含了几千个发布在 Stack Overflow 上的编程问题(例如,\"How can sort a dictionary by value in Python?\")。每一个问题都只有一个标签(Python、CSharp、JavaScript 或 Java)。您的任务是将问题作为输入,并预测适当的标签,在本例中为 Python。\n", + "\n", + "您将使用的数据集包含从 [BigQuery](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) 上更大的公共 Stack Overflow 数据集提取的数千个问题,其中包含超过 1700 万个帖子。\n", + "\n", + "下载数据集后,您会发现它与您之前使用的 IMDB 数据集具有相似的目录结构:\n", + "\n", + "```\n", + "train/\n", + "...python/\n", + "......0.txt\n", + "......1.txt\n", + "...javascript/\n", + "......0.txt\n", + "......1.txt\n", + "...csharp/\n", + "......0.txt\n", + "......1.txt\n", + "...java/\n", + "......0.txt\n", + "......1.txt\n", + "```\n", + "\n", + "注:为了增加分类问题的难度,编程问题中出现的 Python、CSharp、JavaScript 或 Java 等词已被替换为 *blank*(因为许多问题都包含它们所涉及的语言)。\n", + "\n", + "要完成此练习,您应该对此笔记本进行以下修改以使用 Stack Overflow 数据集:\n", + "\n", + "1. 在笔记本顶部,将下载 IMDB 数据集的代码更新为下载前面准备好的 [Stack Overflow 数据集](https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz)的代码。由于 Stack Overflow 数据集具有类似的目录结构,因此您不需要进行太多修改。\n", + "\n", + "2. 将模型的最后一层修改为 `Dense(4)`,因为现在有四个输出类。\n", + "\n", + "3. 编译模型时,将损失更改为 `tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)`。当每个类的标签是整数(在本例中,它们可以是 0、*1*、*2* 或 *3*)时,这是用于多类分类问题的正确损失函数。 此外,将指标更改为 `metrics=['accuracy']`,因为这是一个多类分类问题(`tf.metrics.BinaryAccuracy` 仅用于二元分类器 )。\n", + "\n", + "4. 在绘制随时间变化的准确率时,请将 `binary_accuracy` 和 `val_binary_accuracy` 分别更改为 `accuracy` 和 `val_accuracy`。\n", + "\n", + "5. 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F0T5SIwSm7uc" + }, + "source": [ + "## 了解更多信息\n", + "\n", + "本教程从头开始介绍了文本分类。要详细了解一般的文本分类工作流程,请查看 Google Developers 提供的[文本分类指南](https://developers.google.com/machine-learning/guides/text-classification/)。\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "text_classification.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } From 3187107c1f40314bc661df1440894fb2ce7b67fd Mon Sep 17 00:00:00 2001 From: niushuaibing Date: Fri, 14 Feb 2025 16:16:07 +0800 Subject: [PATCH 3/3] fix path error --- site/en-snapshot/tutorials/keras/text_classification.ipynb | 2 +- site/es-419/tutorials/keras/text_classification.ipynb | 2 +- site/ja/tutorials/keras/text_classification.ipynb | 2 +- site/ko/tutorials/keras/text_classification.ipynb | 2 +- site/pt-br/tutorials/keras/text_classification.ipynb | 2 +- site/zh-cn/tutorials/keras/text_classification.ipynb | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/site/en-snapshot/tutorials/keras/text_classification.ipynb b/site/en-snapshot/tutorials/keras/text_classification.ipynb index 4182c3f295..b0db879470 100644 --- a/site/en-snapshot/tutorials/keras/text_classification.ipynb +++ b/site/en-snapshot/tutorials/keras/text_classification.ipynb @@ -171,7 +171,7 @@ " untar=True, cache_dir='.',\n", " cache_subdir='')\n", "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" ] }, { diff --git a/site/es-419/tutorials/keras/text_classification.ipynb b/site/es-419/tutorials/keras/text_classification.ipynb index 889018a71c..0a16f4de8c 100644 --- a/site/es-419/tutorials/keras/text_classification.ipynb +++ b/site/es-419/tutorials/keras/text_classification.ipynb @@ -163,7 +163,7 @@ " untar=True, cache_dir='.',\n", " cache_subdir='')\n", "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" ] }, { diff --git a/site/ja/tutorials/keras/text_classification.ipynb b/site/ja/tutorials/keras/text_classification.ipynb index 4448bf8d3d..82daf7f0b4 100644 --- a/site/ja/tutorials/keras/text_classification.ipynb +++ b/site/ja/tutorials/keras/text_classification.ipynb @@ -163,7 +163,7 @@ " untar=True, cache_dir='.',\n", " cache_subdir='')\n", "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" ] }, { diff --git a/site/ko/tutorials/keras/text_classification.ipynb b/site/ko/tutorials/keras/text_classification.ipynb index 74b14fda01..2b3702918c 100644 --- a/site/ko/tutorials/keras/text_classification.ipynb +++ b/site/ko/tutorials/keras/text_classification.ipynb @@ -163,7 +163,7 @@ " untar=True, cache_dir='.',\n", " cache_subdir='')\n", "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" ] }, { diff --git a/site/pt-br/tutorials/keras/text_classification.ipynb b/site/pt-br/tutorials/keras/text_classification.ipynb index 259fe0d015..ee8b62ed3d 100644 --- a/site/pt-br/tutorials/keras/text_classification.ipynb +++ b/site/pt-br/tutorials/keras/text_classification.ipynb @@ -167,7 +167,7 @@ " untar=True, cache_dir='.',\n", " cache_subdir='')\n", "\n", - "dataset_dir = 
os.path.join(os.path.dirname(dataset), 'aclImdb')" + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" ] }, { diff --git a/site/zh-cn/tutorials/keras/text_classification.ipynb b/site/zh-cn/tutorials/keras/text_classification.ipynb index a9beeea6ec..ae8abdca7f 100644 --- a/site/zh-cn/tutorials/keras/text_classification.ipynb +++ b/site/zh-cn/tutorials/keras/text_classification.ipynb @@ -163,7 +163,7 @@ " untar=True, cache_dir='.',\n", " cache_subdir='')\n", "\n", - "dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')" + "dataset_dir = os.path.join(os.path.basename(dataset), 'aclImdb')" ] }, {