text segunda parte editado

5abbbdef · Alejandro Molina Villegas · 7d66d184 · 5abbbdef
Commit 5abbbdef authored Apr 02, 2019 by Alejandro Molina Villegas
Hide whitespace changes
Inline Side-by-side

Showing with 88 additions and 30 deletions

09-TextProcessing.ipynb 09-TextProcessing.ipynb +88 -30

No files found.
--- a/09-TextProcessing.ipynb
+++ b/09-TextProcessing.ipynb
@@ -205,7 +205,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -215,7 +215,7 @@
       "\twith 18 stored elements in Compressed Sparse Row format>"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -235,7 +235,43 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  (0, 1)\t1\n",
+      "  (0, 6)\t1\n",
+      "  (0, 2)\t1\n",
+      "  (0, 4)\t1\n",
+      "  (0, 5)\t1\n",
+      "  (1, 7)\t1\n",
+      "  (1, 1)\t1\n",
+      "  (1, 2)\t1\n",
+      "  (1, 4)\t1\n",
+      "  (1, 5)\t1\n",
+      "  (2, 8)\t1\n",
+      "  (2, 2)\t1\n",
+      "  (3, 3)\t1\n",
+      "  (3, 0)\t1\n",
+      "  (3, 6)\t1\n",
+      "  (3, 2)\t1\n",
+      "  (3, 4)\t1\n",
+      "  (3, 5)\t1\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@@ -244,7 +280,7 @@
       "['este', 'es', 'un', 'documento', 'analizar']"
      ]
     },
-     "execution_count": 9,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -256,7 +292,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@@ -273,7 +309,7 @@
       " 'tercero']"
      ]
     },
-     "execution_count": 10,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -284,7 +320,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@@ -293,7 +329,7 @@
       "1"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -304,7 +340,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -324,13 +360,13 @@
       " 'son muy chidos']"
      ]
     },
-     "execution_count": 16,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1)\n",
+    "bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=4)\n",
    "analyze = bigram_vectorizer.build_analyzer()\n",
    "analyze('Los ngramas son muy chidos')"
   ]
@@ -344,7 +380,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@@ -354,7 +390,7 @@
       "         use_idf=True)"
      ]
     },
-     "execution_count": 17,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -375,7 +411,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
@@ -389,7 +425,7 @@
       "       [0.58149261, 0.        , 0.81355169]])"
      ]
     },
-     "execution_count": 18,
+     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -413,31 +449,53 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 16,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<4x9 sparse matrix of type '<class 'numpy.float64'>'\n",
-       "\twith 18 stored elements in Compressed Sparse Row format>"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Como tf-idf es muy común para representar documentos, existe la clase \n",
    "# TfidfVectorizer que tiene CountVectorizer y TfidfTransformer \n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vectorizer = TfidfVectorizer()\n",
-    "vectorizer.fit_transform(corpus)\n",
+    "Xprima = vectorizer.fit_transform(corpus)\n",
    "                               "
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  (0, 5)\t0.4181266243877562\n",
+      "  (0, 4)\t0.4181266243877562\n",
+      "  (0, 2)\t0.3418459132932508\n",
+      "  (0, 6)\t0.5164695651831305\n",
+      "  (0, 1)\t0.5164695651831305\n",
+      "  (1, 5)\t0.3878225151467608\n",
+      "  (1, 4)\t0.3878225151467608\n",
+      "  (1, 2)\t0.3170703183040649\n",
+      "  (1, 1)\t0.4790379614294201\n",
+      "  (1, 7)\t0.6075989123184679\n",
+      "  (2, 2)\t0.46263733109032296\n",
+      "  (2, 8)\t0.8865476297873808\n",
+      "  (3, 5)\t0.3314387711719163\n",
+      "  (3, 4)\t0.3314387711719163\n",
+      "  (3, 2)\t0.2709729130450805\n",
+      "  (3, 6)\t0.4093928203750212\n",
+      "  (3, 0)\t0.519262881857229\n",
+      "  (3, 3)\t0.519262881857229\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(Xprima)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},