text segunda parte editado

parent 7d66d184
...@@ -205,7 +205,7 @@ ...@@ -205,7 +205,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -215,7 +215,7 @@ ...@@ -215,7 +215,7 @@
"\twith 18 stored elements in Compressed Sparse Row format>" "\twith 18 stored elements in Compressed Sparse Row format>"
] ]
}, },
"execution_count": 5, "execution_count": 4,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -235,7 +235,43 @@ ...@@ -235,7 +235,43 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 1)\t1\n",
" (0, 6)\t1\n",
" (0, 2)\t1\n",
" (0, 4)\t1\n",
" (0, 5)\t1\n",
" (1, 7)\t1\n",
" (1, 1)\t1\n",
" (1, 2)\t1\n",
" (1, 4)\t1\n",
" (1, 5)\t1\n",
" (2, 8)\t1\n",
" (2, 2)\t1\n",
" (3, 3)\t1\n",
" (3, 0)\t1\n",
" (3, 6)\t1\n",
" (3, 2)\t1\n",
" (3, 4)\t1\n",
" (3, 5)\t1\n"
]
}
],
"source": [
"print(X)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -244,7 +280,7 @@ ...@@ -244,7 +280,7 @@
"['este', 'es', 'un', 'documento', 'analizar']" "['este', 'es', 'un', 'documento', 'analizar']"
] ]
}, },
"execution_count": 9, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -256,7 +292,7 @@ ...@@ -256,7 +292,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -273,7 +309,7 @@ ...@@ -273,7 +309,7 @@
" 'tercero']" " 'tercero']"
] ]
}, },
"execution_count": 10, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -284,7 +320,7 @@ ...@@ -284,7 +320,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -293,7 +329,7 @@ ...@@ -293,7 +329,7 @@
"1" "1"
] ]
}, },
"execution_count": 12, "execution_count": 9,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -304,7 +340,7 @@ ...@@ -304,7 +340,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -324,13 +360,13 @@ ...@@ -324,13 +360,13 @@
" 'son muy chidos']" " 'son muy chidos']"
] ]
}, },
"execution_count": 16, "execution_count": 12,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1)\n", "bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=4)\n",
"analyze = bigram_vectorizer.build_analyzer()\n", "analyze = bigram_vectorizer.build_analyzer()\n",
"analyze('Los ngramas son muy chidos')" "analyze('Los ngramas son muy chidos')"
] ]
...@@ -344,7 +380,7 @@ ...@@ -344,7 +380,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -354,7 +390,7 @@ ...@@ -354,7 +390,7 @@
" use_idf=True)" " use_idf=True)"
] ]
}, },
"execution_count": 17, "execution_count": 13,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -375,7 +411,7 @@ ...@@ -375,7 +411,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -389,7 +425,7 @@ ...@@ -389,7 +425,7 @@
" [0.58149261, 0. , 0.81355169]])" " [0.58149261, 0. , 0.81355169]])"
] ]
}, },
"execution_count": 18, "execution_count": 14,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
...@@ -413,31 +449,53 @@ ...@@ -413,31 +449,53 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<4x9 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 18 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 20,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "outputs": [],
}
],
"source": [ "source": [
"# Como tf-idf es muy común para representar documentos, existe la clase \n", "# Como tf-idf es muy común para representar documentos, existe la clase \n",
"# TfidfVectorizer que tiene CountVectorizer y TfidfTransformer \n", "# TfidfVectorizer que tiene CountVectorizer y TfidfTransformer \n",
"\n", "\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer()\n", "vectorizer = TfidfVectorizer()\n",
"vectorizer.fit_transform(corpus)\n", "Xprima = vectorizer.fit_transform(corpus)\n",
" " " "
] ]
}, },
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 5)\t0.4181266243877562\n",
" (0, 4)\t0.4181266243877562\n",
" (0, 2)\t0.3418459132932508\n",
" (0, 6)\t0.5164695651831305\n",
" (0, 1)\t0.5164695651831305\n",
" (1, 5)\t0.3878225151467608\n",
" (1, 4)\t0.3878225151467608\n",
" (1, 2)\t0.3170703183040649\n",
" (1, 1)\t0.4790379614294201\n",
" (1, 7)\t0.6075989123184679\n",
" (2, 2)\t0.46263733109032296\n",
" (2, 8)\t0.8865476297873808\n",
" (3, 5)\t0.3314387711719163\n",
" (3, 4)\t0.3314387711719163\n",
" (3, 2)\t0.2709729130450805\n",
" (3, 6)\t0.4093928203750212\n",
" (3, 0)\t0.519262881857229\n",
" (3, 3)\t0.519262881857229\n"
]
}
],
"source": [
"print(Xprima)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment