text second part

parent bea5d78a
......@@ -30,13 +30,6 @@
" - Crowdsourcing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -131,6 +124,1162 @@
"* Sea cuidadoso ya que el sitio podría banear su IP en caso de que detecte un ataque"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 9. Procesamiento de Lenguaje Natural parte 2\n",
"\n",
"\n",
"## Contenido de la Presentación\n",
"\n",
"\n",
"https://docs.google.com/presentation/d/1WCVA9bMu12rfQDSg5guPXg6FkLgKmRsUM9aptMJ4Z1s/edit?usp=sharing\n",
"\n",
"\n",
" \n",
"#### 9.4 Codificación textual:\n",
"\n",
" - Bolsa de palabras\n",
" - Modelo Vectorial\n",
"\n",
"\n",
"#### 9.5 Similitud Textual con producto coseno\n",
"\n",
"\n",
"#### 9.6 Técnicas de Análisis de Textos con Aprendizaje Automático en estudios de caso:\n",
"\n",
"\n",
" - Clasificación\n",
" - Agrupamiento (Clustering)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modelo vectorial en sklearn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer = CountVectorizer()\n",
"vectorizer "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<4x9 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 18 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ejemplo de un modelo vectorial minimalista\n",
"\n",
"corpus = [\n",
" 'Este es el primer documento.',\n",
" 'Este es el segundo documento.',\n",
" 'Y el tercero.',\n",
" 'Acaso este es el primer elemento?',\n",
"]\n",
"X = vectorizer.fit_transform(corpus)\n",
"X \n"
]
},
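{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a minimal sketch, assuming `X` and `vectorizer` from the cell above): converting the sparse matrix to a dense array shows each document as a row of term counts over the learned vocabulary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# dense view of the bag-of-words matrix: one row per document,\n",
"# one column per vocabulary term (in the order of get_feature_names())\n",
"print(vectorizer.get_feature_names())\n",
"X.toarray()"
]
},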
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['este', 'es', 'un', 'documento', 'analizar']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"analyze = vectorizer.build_analyzer()\n",
"analyze(\"Este es un documento a analizar.\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['acaso',\n",
" 'documento',\n",
" 'el',\n",
" 'elemento',\n",
" 'es',\n",
" 'este',\n",
" 'primer',\n",
" 'segundo',\n",
" 'tercero']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.vocabulary_.get('documento')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['los',\n",
" 'ngramas',\n",
" 'son',\n",
" 'muy',\n",
" 'chidos',\n",
" 'los ngramas',\n",
" 'ngramas son',\n",
" 'son muy',\n",
" 'muy chidos',\n",
" 'los ngramas son',\n",
" 'ngramas son muy',\n",
" 'son muy chidos']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1)\n",
"analyze = bigram_vectorizer.build_analyzer()\n",
"analyze('Los ngramas son muy chidos')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modelo vectorial tf-idf "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,\n",
" use_idf=True)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"transformer = TfidfTransformer(smooth_idf=False)\n",
"transformer \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Ejemplo usando conteos de palabras. La primera palabra está presente en el 100% de los documentos y por lo tanto, consideramos no muy importante."
]
},
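{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, with `smooth_idf=False` and `norm='l2'` the transformer computes\n",
"\n",
"$$\\mathrm{idf}(t) = \\ln\\frac{n}{\\mathrm{df}(t)} + 1, \\qquad \\text{tf-idf}(t, d) = \\mathrm{tf}(t, d)\\cdot\\mathrm{idf}(t),$$\n",
"\n",
"and then L2-normalizes each row. A quick check on the first row of `counts` below, `[3, 0, 1]`: the first term occurs in all 6 documents, so $\\mathrm{idf} = \\ln(6/6) + 1 = 1$ and its weight is $3$; the third term occurs in 2 of 6 documents, so $\\mathrm{idf} = \\ln(6/2) + 1 \\approx 2.099$ and its weight is $\\approx 2.099$. Normalizing $[3, 0, 2.099]$ gives $\\approx [0.819, 0, 0.573]$, matching the first row of the output."
]
},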
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.81940995, 0. , 0.57320793],\n",
" [1. , 0. , 0. ],\n",
" [1. , 0. , 0. ],\n",
" [1. , 0. , 0. ],\n",
" [0.47330339, 0.88089948, 0. ],\n",
" [0.58149261, 0. , 0.81355169]])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts = [[3, 0, 1],\n",
" [2, 0, 0],\n",
" [3, 0, 0],\n",
" [4, 0, 0],\n",
" [3, 2, 0],\n",
" [3, 0, 2]]\n",
"\n",
"tfidf = transformer.fit_transform(counts)\n",
"tfidf \n",
"\n",
"\n",
"\n",
"tfidf.toarray() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<4x9 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 18 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Como tf-idf es muy común para representar documentos, existe la clase \n",
"# TfidfVectorizer que tiene CountVectorizer y TfidfTransformer \n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer()\n",
"vectorizer.fit_transform(corpus)\n",
" "
]
},
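{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Text similarity with the cosine product\n",
"\n",
"A minimal sketch for section 9.5, assuming the `corpus` list and the `TfidfVectorizer` from the cells above: `cosine_similarity` from `sklearn.metrics.pairwise` computes the pairwise cosine between the tf-idf rows (since the rows are L2-normalized, this is just their dot product)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"# tf-idf vectors for the small corpus defined earlier\n",
"tfidf_matrix = vectorizer.fit_transform(corpus)\n",
"\n",
"# entry [i, j] is the cosine similarity between document i and document j\n",
"cosine_similarity(tfidf_matrix)"
]
},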
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ejemplo completo Clasificación de 20 newsgroups dataset"
]
},
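{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before the full benchmark script below, a minimal sketch of the same idea (assuming the 20 newsgroups download succeeds): vectorize the training documents with `TfidfVectorizer`, train a single classifier, and report accuracy on the test split."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn import metrics\n",
"\n",
"categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)\n",
"test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)\n",
"\n",
"# tf-idf features learned on the training set, reused for the test set\n",
"vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')\n",
"X_train = vec.fit_transform(train.data)\n",
"X_test = vec.transform(test.data)\n",
"\n",
"# a single Naive Bayes classifier instead of the full benchmark below\n",
"clf = MultinomialNB(alpha=.01).fit(X_train, train.target)\n",
"pred = clf.predict(X_test)\n",
"print(\"accuracy: %0.3f\" % metrics.accuracy_score(test.target, pred))"
]
},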
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 20news dataset. This may take a few minutes.\n",
"2019-04-01 20:56:53,543 INFO Downloading 20news dataset. This may take a few minutes.\n",
"Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n",
"2019-04-01 20:56:53,548 INFO Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n",
"Usage: ipykernel_launcher.py [options]\n",
"\n",
"Options:\n",
" -h, --help show this help message and exit\n",
" --report Print a detailed classification report.\n",
" --chi2_select=SELECT_CHI2\n",
" Select some number of features using a chi-squared\n",
" test\n",
" --confusion_matrix Print the confusion matrix.\n",
" --top10 Print ten most discriminative terms per class for\n",
" every classifier.\n",
" --all_categories Whether to use all categories or not.\n",
" --use_hashing Use a hashing vectorizer.\n",
" --n_features=N_FEATURES\n",
" n_features when using the hashing vectorizer.\n",
" --filtered Remove newsgroup information that is easily overfit:\n",
" headers, signatures, and quoting.\n",
"\n",
"Loading 20 newsgroups dataset for categories:\n",
"['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"data loaded\n",
"2034 documents - 3.980MB (training set)\n",
"1353 documents - 2.867MB (test set)\n",
"4 categories\n",
"\n",
"Extracting features from the training data using a sparse vectorizer\n",
"done in 0.737494s at 5.396MB/s\n",
"n_samples: 2034, n_features: 33809\n",
"\n",
"Extracting features from the test data using the same vectorizer\n",
"done in 0.422445s at 6.788MB/s\n",
"n_samples: 1353, n_features: 33809\n",
"\n",
"================================================================================\n",
"Ridge Classifier\n",
"________________________________________________________________________________\n",
"Training: \n",
"RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,\n",
" max_iter=None, normalize=False, random_state=None, solver='sag',\n",
" tol=0.01)\n",
"train time: 0.235s\n",
"test time: 0.006s\n",
"accuracy: 0.896\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"================================================================================\n",
"Perceptron\n",
"________________________________________________________________________________\n",
"Training: \n",
"Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,\n",
" fit_intercept=True, max_iter=50, n_iter=None, n_iter_no_change=5,\n",
" n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.001,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.027s\n",
"test time: 0.003s\n",
"accuracy: 0.888\n",
"dimensionality: 33809\n",
"density: 0.240114\n",
"\n",
"\n",
"================================================================================\n",
"Passive-Aggressive\n",
"________________________________________________________________________________\n",
"Training: \n",
"PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,\n",
" early_stopping=False, fit_intercept=True, loss='hinge',\n",
" max_iter=50, n_iter=None, n_iter_no_change=5, n_jobs=None,\n",
" random_state=None, shuffle=True, tol=0.001,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.031s\n",
"test time: 0.003s\n",
"accuracy: 0.905\n",
"dimensionality: 33809\n",
"density: 0.716584\n",
"\n",
"\n",
"================================================================================\n",
"kNN\n",
"________________________________________________________________________________\n",
"Training: \n",
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=None, n_neighbors=10, p=2,\n",
" weights='uniform')\n",
"train time: 0.005s\n",
"test time: 0.260s\n",
"accuracy: 0.858\n",
"\n",
"================================================================================\n",
"Random forest\n",
"________________________________________________________________________________\n",
"Training: \n",
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)\n",
"train time: 1.827s\n",
"test time: 0.154s\n",
"accuracy: 0.827\n",
"\n",
"================================================================================\n",
"L2 penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.001,\n",
" verbose=0)\n",
"train time: 0.209s\n",
"test time: 0.002s\n",
"accuracy: 0.900\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.171s\n",
"test time: 0.002s\n",
"accuracy: 0.903\n",
"dimensionality: 33809\n",
"density: 0.664172\n",
"\n",
"\n",
"================================================================================\n",
"L1 penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l1', random_state=None, tol=0.001,\n",
" verbose=0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py:183: FutureWarning: max_iter and tol parameters have been added in SGDClassifier in 0.19. If max_iter is set but tol is left unset, the default value for tol in 0.19 and 0.20 will be None (which is equivalent to -infinity, so it has no effect) but will change in 0.21 to 1e-3. Specify tol to silence this warning.\n",
" FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"train time: 0.374s\n",
"test time: 0.004s\n",
"accuracy: 0.873\n",
"dimensionality: 33809\n",
"density: 0.005561\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.487s\n",
"test time: 0.002s\n",
"accuracy: 0.882\n",
"dimensionality: 33809\n",
"density: 0.020387\n",
"\n",
"\n",
"================================================================================\n",
"Elastic-Net penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.625s\n",
"test time: 0.004s\n",
"accuracy: 0.899\n",
"dimensionality: 33809\n",
"density: 0.188648\n",
"\n",
"\n",
"================================================================================\n",
"NearestCentroid (aka Rocchio classifier)\n",
"________________________________________________________________________________\n",
"Training: \n",
"NearestCentroid(metric='euclidean', shrink_threshold=None)\n",
"train time: 0.020s\n",
"test time: 0.005s\n",
"accuracy: 0.855\n",
"\n",
"================================================================================\n",
"Naive Bayes\n",
"________________________________________________________________________________\n",
"Training: \n",
"MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)\n",
"train time: 0.011s\n",
"test time: 0.002s\n",
"accuracy: 0.899\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)\n",
"train time: 0.014s\n",
"test time: 0.012s\n",
"accuracy: 0.884\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"ComplementNB(alpha=0.1, class_prior=None, fit_prior=True, norm=False)\n",
"train time: 0.012s\n",
"test time: 0.002s\n",
"accuracy: 0.911\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"================================================================================\n",
"LinearSVC with L1-based feature selection\n",
"________________________________________________________________________________\n",
"Training: \n",
"Pipeline(memory=None,\n",
" steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l1', random_state=None, tol=0.001,\n",
" verbose=0),\n",
" max_features=None, no...ax_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
" verbose=0))])\n",
"train time: 0.340s\n",
"test time: 0.005s\n",
"accuracy: 0.880\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
"# Olivier Grisel <olivier.grisel@ensta.org>\n",
"# Mathieu Blondel <mathieu@mblondel.org>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"\n",
"from __future__ import print_function\n",
"\n",
"import logging\n",
"import numpy as np\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"from sklearn.linear_model import RidgeClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.linear_model import Perceptron\n",
"from sklearn.linear_model import PassiveAggressiveClassifier\n",
"from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.neighbors import NearestCentroid\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.utils.extmath import density\n",
"from sklearn import metrics\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--report\",\n",
" action=\"store_true\", dest=\"print_report\",\n",
" help=\"Print a detailed classification report.\")\n",
"op.add_option(\"--chi2_select\",\n",
" action=\"store\", type=\"int\", dest=\"select_chi2\",\n",
" help=\"Select some number of features using a chi-squared test\")\n",
"op.add_option(\"--confusion_matrix\",\n",
" action=\"store_true\", dest=\"print_cm\",\n",
" help=\"Print the confusion matrix.\")\n",
"op.add_option(\"--top10\",\n",
" action=\"store_true\", dest=\"print_top10\",\n",
" help=\"Print ten most discriminative terms per class\"\n",
" \" for every classifier.\")\n",
"op.add_option(\"--all_categories\",\n",
" action=\"store_true\", dest=\"all_categories\",\n",
" help=\"Whether to use all categories or not.\")\n",
"op.add_option(\"--use_hashing\",\n",
" action=\"store_true\",\n",
" help=\"Use a hashing vectorizer.\")\n",
"op.add_option(\"--n_features\",\n",
" action=\"store\", type=int, default=2 ** 16,\n",
" help=\"n_features when using the hashing vectorizer.\")\n",
"op.add_option(\"--filtered\",\n",
" action=\"store_true\",\n",
" help=\"Remove newsgroup information that is easily overfit: \"\n",
" \"headers, signatures, and quoting.\")\n",
"\n",
"\n",
"def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1)\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"if opts.all_categories:\n",
" categories = None\n",
"else:\n",
" categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"\n",
"if opts.filtered:\n",
" remove = ('headers', 'footers', 'quotes')\n",
"else:\n",
" remove = ()\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories if categories else \"all\")\n",
"\n",
"data_train = fetch_20newsgroups(subset='train', categories=categories,\n",
" shuffle=True, random_state=42,\n",
" remove=remove)\n",
"\n",
"data_test = fetch_20newsgroups(subset='test', categories=categories,\n",
" shuffle=True, random_state=42,\n",
" remove=remove)\n",
"print('data loaded')\n",
"\n",
"# order of labels in `target_names` can be different from `categories`\n",
"target_names = data_train.target_names\n",
"\n",
"\n",
"def size_mb(docs):\n",
" return sum(len(s.encode('utf-8')) for s in docs) / 1e6\n",
"\n",
"\n",
"data_train_size_mb = size_mb(data_train.data)\n",
"data_test_size_mb = size_mb(data_test.data)\n",
"\n",
"print(\"%d documents - %0.3fMB (training set)\" % (\n",
" len(data_train.data), data_train_size_mb))\n",
"print(\"%d documents - %0.3fMB (test set)\" % (\n",
" len(data_test.data), data_test_size_mb))\n",
"print(\"%d categories\" % len(target_names))\n",
"print()\n",
"\n",
"# split a training set and a test set\n",
"y_train, y_test = data_train.target, data_test.target\n",
"\n",
"print(\"Extracting features from the training data using a sparse vectorizer\")\n",
"t0 = time()\n",
"if opts.use_hashing:\n",
" vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,\n",
" n_features=opts.n_features)\n",
" X_train = vectorizer.transform(data_train.data)\n",
"else:\n",
" vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,\n",
" stop_words='english')\n",
" X_train = vectorizer.fit_transform(data_train.data)\n",
"duration = time() - t0\n",
"print(\"done in %fs at %0.3fMB/s\" % (duration, data_train_size_mb / duration))\n",
"print(\"n_samples: %d, n_features: %d\" % X_train.shape)\n",
"print()\n",
"\n",
"print(\"Extracting features from the test data using the same vectorizer\")\n",
"t0 = time()\n",
"X_test = vectorizer.transform(data_test.data)\n",
"duration = time() - t0\n",
"print(\"done in %fs at %0.3fMB/s\" % (duration, data_test_size_mb / duration))\n",
"print(\"n_samples: %d, n_features: %d\" % X_test.shape)\n",
"print()\n",
"\n",
"# mapping from integer feature name to original token string\n",
"if opts.use_hashing:\n",
" feature_names = None\n",
"else:\n",
" feature_names = vectorizer.get_feature_names()\n",
"\n",
"if opts.select_chi2:\n",
" print(\"Extracting %d best features by a chi-squared test\" %\n",
" opts.select_chi2)\n",
" t0 = time()\n",
" ch2 = SelectKBest(chi2, k=opts.select_chi2)\n",
" X_train = ch2.fit_transform(X_train, y_train)\n",
" X_test = ch2.transform(X_test)\n",
" if feature_names:\n",
" # keep selected feature names\n",
" feature_names = [feature_names[i] for i\n",
" in ch2.get_support(indices=True)]\n",
" print(\"done in %fs\" % (time() - t0))\n",
" print()\n",
"\n",
"if feature_names:\n",
" feature_names = np.asarray(feature_names)\n",
"\n",
"\n",
"def trim(s):\n",
" \"\"\"Trim string to fit on terminal (assuming 80-column display)\"\"\"\n",
" return s if len(s) <= 80 else s[:77] + \"...\"\n",
"\n",
"\n",
"# #############################################################################\n",
"# Benchmark classifiers\n",
"def benchmark(clf):\n",
" print('_' * 80)\n",
" print(\"Training: \")\n",
" print(clf)\n",
" t0 = time()\n",
" clf.fit(X_train, y_train)\n",
" train_time = time() - t0\n",
" print(\"train time: %0.3fs\" % train_time)\n",
"\n",
" t0 = time()\n",
" pred = clf.predict(X_test)\n",
" test_time = time() - t0\n",
" print(\"test time: %0.3fs\" % test_time)\n",
"\n",
" score = metrics.accuracy_score(y_test, pred)\n",
" print(\"accuracy: %0.3f\" % score)\n",
"\n",
" if hasattr(clf, 'coef_'):\n",
" print(\"dimensionality: %d\" % clf.coef_.shape[1])\n",
" print(\"density: %f\" % density(clf.coef_))\n",
"\n",
" if opts.print_top10 and feature_names is not None:\n",
" print(\"top 10 keywords per class:\")\n",
" for i, label in enumerate(target_names):\n",
" top10 = np.argsort(clf.coef_[i])[-10:]\n",
" print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n",
" print()\n",
"\n",
" if opts.print_report:\n",
" print(\"classification report:\")\n",
" print(metrics.classification_report(y_test, pred,\n",
" target_names=target_names))\n",
"\n",
" if opts.print_cm:\n",
" print(\"confusion matrix:\")\n",
" print(metrics.confusion_matrix(y_test, pred))\n",
"\n",
" print()\n",
" clf_descr = str(clf).split('(')[0]\n",
" return clf_descr, score, train_time, test_time\n",
"\n",
"\n",
"results = []\n",
"for clf, name in (\n",
" (RidgeClassifier(tol=1e-2, solver=\"sag\"), \"Ridge Classifier\"),\n",
" (Perceptron(max_iter=50, tol=1e-3), \"Perceptron\"),\n",
" (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),\n",
" \"Passive-Aggressive\"),\n",
" (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n",
" (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n",
" print('=' * 80)\n",
" print(name)\n",
" results.append(benchmark(clf))\n",
"\n",
"for penalty in [\"l2\", \"l1\"]:\n",
" print('=' * 80)\n",
" print(\"%s penalty\" % penalty.upper())\n",
" # Train Liblinear model\n",
" results.append(benchmark(LinearSVC(penalty=penalty, dual=False,\n",
" tol=1e-3)))\n",
"\n",
" # Train SGD model\n",
" results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,\n",
" penalty=penalty)))\n",
"\n",
"# Train SGD with Elastic Net penalty\n",
"print('=' * 80)\n",
"print(\"Elastic-Net penalty\")\n",
"results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,\n",
" penalty=\"elasticnet\")))\n",
"\n",
"# Train NearestCentroid without threshold\n",
"print('=' * 80)\n",
"print(\"NearestCentroid (aka Rocchio classifier)\")\n",
"results.append(benchmark(NearestCentroid()))\n",
"\n",
"# Train sparse Naive Bayes classifiers\n",
"print('=' * 80)\n",
"print(\"Naive Bayes\")\n",
"results.append(benchmark(MultinomialNB(alpha=.01)))\n",
"results.append(benchmark(BernoulliNB(alpha=.01)))\n",
"results.append(benchmark(ComplementNB(alpha=.1)))\n",
"\n",
"print('=' * 80)\n",
"print(\"LinearSVC with L1-based feature selection\")\n",
"# The smaller C, the stronger the regularization.\n",
"# The more regularization, the more sparsity.\n",
"results.append(benchmark(Pipeline([\n",
" ('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\", dual=False,\n",
" tol=1e-3))),\n",
" ('classification', LinearSVC(penalty=\"l2\"))])))\n",
"\n",
"# make some plots\n",
"\n",
"indices = np.arange(len(results))\n",
"\n",
"results = [[x[i] for x in results] for i in range(4)]\n",
"\n",
"clf_names, score, training_time, test_time = results\n",
"training_time = np.array(training_time) / np.max(training_time)\n",
"test_time = np.array(test_time) / np.max(test_time)\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"plt.title(\"Score\")\n",
"plt.barh(indices, score, .2, label=\"score\", color='navy')\n",
"plt.barh(indices + .3, training_time, .2, label=\"training time\",\n",
" color='c')\n",
"plt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\n",
"plt.yticks(())\n",
"plt.legend(loc='best')\n",
"plt.subplots_adjust(left=.25)\n",
"plt.subplots_adjust(top=.95)\n",
"plt.subplots_adjust(bottom=.05)\n",
"\n",
"for i, c in zip(indices, clf_names):\n",
" plt.text(-.3, i, c)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ejemplo completo Clustering"
]
},
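{
"cell_type": "markdown",
"metadata": {},
"source": [
"As with classification, a minimal sketch before the full script (assuming the same four 20 newsgroups categories): tf-idf vectors clustered with `KMeans` and evaluated against the true labels."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.cluster import KMeans\n",
"from sklearn import metrics\n",
"\n",
"categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"dataset = fetch_20newsgroups(subset='all', categories=categories, random_state=42)\n",
"\n",
"# tf-idf features; very rare and very common terms are dropped\n",
"vec = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')\n",
"X = vec.fit_transform(dataset.data)\n",
"\n",
"# one cluster per true category, using the labels only for evaluation\n",
"km = KMeans(n_clusters=4, init='k-means++', n_init=1, random_state=42).fit(X)\n",
"print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(dataset.target, km.labels_))\n",
"print(\"V-measure: %0.3f\" % metrics.v_measure_score(dataset.target, km.labels_))"
]
},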
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n",
"Usage: ipykernel_launcher.py [options]\n",
"\n",
"Options:\n",
" -h, --help show this help message and exit\n",
" --lsa=N_COMPONENTS Preprocess documents with latent semantic analysis.\n",
" --no-minibatch Use ordinary k-means algorithm (in batch mode).\n",
" --no-idf Disable Inverse Document Frequency feature weighting.\n",
" --use-hashing Use a hashing feature vectorizer\n",
" --n-features=N_FEATURES\n",
" Maximum number of features (dimensions) to extract\n",
" from text.\n",
" --verbose Print progress reports inside k-means algorithm.\n",
"Loading 20 newsgroups dataset for categories:\n",
"['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"3387 documents\n",
"4 categories\n",
"\n",
"Extracting features from the training dataset using a sparse vectorizer\n",
"done in 1.281258s\n",
"n_samples: 3387, n_features: 10000\n",
"\n",
"Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',\n",
" init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,\n",
" n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,\n",
" verbose=False)\n",
"done in 0.094s\n",
"\n",
"Homogeneity: 0.596\n",
"Completeness: 0.651\n",
"V-measure: 0.623\n",
"Adjusted Rand-Index: 0.569\n",
"Silhouette Coefficient: 0.008\n",
"\n",
"Top terms per cluster:\n",
"Cluster 0: graphics image university thanks com files file 3d ac posting\n",
"Cluster 1: access digex henry pat toronto net com hst prb zoo\n",
"Cluster 2: space nasa gov alaska moon launch com shuttle just like\n",
"Cluster 3: god com people sandvik article don jesus say keith christian\n"
]
}
],
"source": [
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"\n",
"from __future__ import print_function\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import Normalizer\n",
"from sklearn import metrics\n",
"\n",
"from sklearn.cluster import KMeans, MiniBatchKMeans\n",
"\n",
"import logging\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"\n",
"import numpy as np\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--lsa\",\n",
" dest=\"n_components\", type=\"int\",\n",
" help=\"Preprocess documents with latent semantic analysis.\")\n",
"op.add_option(\"--no-minibatch\",\n",
" action=\"store_false\", dest=\"minibatch\", default=True,\n",
" help=\"Use ordinary k-means algorithm (in batch mode).\")\n",
"op.add_option(\"--no-idf\",\n",
" action=\"store_false\", dest=\"use_idf\", default=True,\n",
" help=\"Disable Inverse Document Frequency feature weighting.\")\n",
"op.add_option(\"--use-hashing\",\n",
" action=\"store_true\", default=False,\n",
" help=\"Use a hashing feature vectorizer\")\n",
"op.add_option(\"--n-features\", type=int, default=10000,\n",
" help=\"Maximum number of features (dimensions)\"\n",
" \" to extract from text.\")\n",
"op.add_option(\"--verbose\",\n",
" action=\"store_true\", dest=\"verbose\", default=False,\n",
" help=\"Print progress reports inside k-means algorithm.\")\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"\n",
"\n",
"def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1)\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
"]\n",
"# Uncomment the following to do the analysis on all the categories\n",
"# categories = None\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories)\n",
"\n",
"dataset = fetch_20newsgroups(subset='all', categories=categories,\n",
" shuffle=True, random_state=42)\n",
"\n",
"print(\"%d documents\" % len(dataset.data))\n",
"print(\"%d categories\" % len(dataset.target_names))\n",
"print()\n",
"\n",
"labels = dataset.target\n",
"true_k = np.unique(labels).shape[0]\n",
"\n",
"print(\"Extracting features from the training dataset \"\n",
" \"using a sparse vectorizer\")\n",
"t0 = time()\n",
"if opts.use_hashing:\n",
" if opts.use_idf:\n",
" # Perform an IDF normalization on the output of HashingVectorizer\n",
" hasher = HashingVectorizer(n_features=opts.n_features,\n",
" stop_words='english', alternate_sign=False,\n",
" norm=None, binary=False)\n",
" vectorizer = make_pipeline(hasher, TfidfTransformer())\n",
" else:\n",
" vectorizer = HashingVectorizer(n_features=opts.n_features,\n",
" stop_words='english',\n",
" alternate_sign=False, norm='l2',\n",
" binary=False)\n",
"else:\n",
" vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n",
" min_df=2, stop_words='english',\n",
" use_idf=opts.use_idf)\n",
"X = vectorizer.fit_transform(dataset.data)\n",
"\n",
"print(\"done in %fs\" % (time() - t0))\n",
"print(\"n_samples: %d, n_features: %d\" % X.shape)\n",
"print()\n",
"\n",
"if opts.n_components:\n",
" print(\"Performing dimensionality reduction using LSA\")\n",
" t0 = time()\n",
" # Vectorizer results are normalized, which makes KMeans behave as\n",
" # spherical k-means for better results. Since LSA/SVD results are\n",
" # not normalized, we have to redo the normalization.\n",
" svd = TruncatedSVD(opts.n_components)\n",
" normalizer = Normalizer(copy=False)\n",
" lsa = make_pipeline(svd, normalizer)\n",
"\n",
" X = lsa.fit_transform(X)\n",
"\n",
" print(\"done in %fs\" % (time() - t0))\n",
"\n",
" explained_variance = svd.explained_variance_ratio_.sum()\n",
" print(\"Explained variance of the SVD step: {}%\".format(\n",
" int(explained_variance * 100)))\n",
"\n",
" print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Do the actual clustering\n",
"\n",
"if opts.minibatch:\n",
" km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n",
" init_size=1000, batch_size=1000, verbose=opts.verbose)\n",
"else:\n",
" km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n",
" verbose=opts.verbose)\n",
"\n",
"print(\"Clustering sparse data with %s\" % km)\n",
"t0 = time()\n",
"km.fit(X)\n",
"print(\"done in %0.3fs\" % (time() - t0))\n",
"print()\n",
"\n",
"print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n",
"print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n",
"print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n",
"print(\"Adjusted Rand-Index: %.3f\"\n",
" % metrics.adjusted_rand_score(labels, km.labels_))\n",
"print(\"Silhouette Coefficient: %0.3f\"\n",
" % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n",
"\n",
"print()\n",
"\n",
"\n",
"if not opts.use_hashing:\n",
" print(\"Top terms per cluster:\")\n",
"\n",
" if opts.n_components:\n",
" original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n",
" order_centroids = original_space_centroids.argsort()[:, ::-1]\n",
" else:\n",
" order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n",
"\n",
" terms = vectorizer.get_feature_names()\n",
" for i in range(true_k):\n",
" print(\"Cluster %d:\" % i, end='')\n",
" for ind in order_centroids[i, :10]:\n",
" print(' %s' % terms[ind], end='')\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
......