text segunda parte editado

parent 7d66d184
......@@ -205,7 +205,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
......@@ -215,7 +215,7 @@
"\twith 18 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
......@@ -235,7 +235,43 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 1)\t1\n",
" (0, 6)\t1\n",
" (0, 2)\t1\n",
" (0, 4)\t1\n",
" (0, 5)\t1\n",
" (1, 7)\t1\n",
" (1, 1)\t1\n",
" (1, 2)\t1\n",
" (1, 4)\t1\n",
" (1, 5)\t1\n",
" (2, 8)\t1\n",
" (2, 2)\t1\n",
" (3, 3)\t1\n",
" (3, 0)\t1\n",
" (3, 6)\t1\n",
" (3, 2)\t1\n",
" (3, 4)\t1\n",
" (3, 5)\t1\n"
]
}
],
"source": [
"print(X)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
......@@ -244,7 +280,7 @@
"['este', 'es', 'un', 'documento', 'analizar']"
]
},
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
......@@ -256,7 +292,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"outputs": [
{
......@@ -273,7 +309,7 @@
" 'tercero']"
]
},
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
......@@ -284,7 +320,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 9,
"metadata": {},
"outputs": [
{
......@@ -293,7 +329,7 @@
"1"
]
},
"execution_count": 12,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
......@@ -304,7 +340,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 12,
"metadata": {},
"outputs": [
{
......@@ -324,13 +360,13 @@
" 'son muy chidos']"
]
},
"execution_count": 16,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1)\n",
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=4)\n",
"analyze = bigram_vectorizer.build_analyzer()\n",
"analyze('Los ngramas son muy chidos')"
]
......@@ -344,7 +380,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 13,
"metadata": {},
"outputs": [
{
......@@ -354,7 +390,7 @@
" use_idf=True)"
]
},
"execution_count": 17,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
......@@ -375,7 +411,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 14,
"metadata": {},
"outputs": [
{
......@@ -389,7 +425,7 @@
" [0.58149261, 0. , 0.81355169]])"
]
},
"execution_count": 18,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
......@@ -413,31 +449,53 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<4x9 sparse matrix of type '<class 'numpy.float64'>'\n",
"\twith 18 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Como tf-idf es muy común para representar documentos, existe la clase \n",
"# TfidfVectorizer que tiene CountVectorizer y TfidfTransformer \n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer()\n",
"vectorizer.fit_transform(corpus)\n",
"Xprima = vectorizer.fit_transform(corpus)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 5)\t0.4181266243877562\n",
" (0, 4)\t0.4181266243877562\n",
" (0, 2)\t0.3418459132932508\n",
" (0, 6)\t0.5164695651831305\n",
" (0, 1)\t0.5164695651831305\n",
" (1, 5)\t0.3878225151467608\n",
" (1, 4)\t0.3878225151467608\n",
" (1, 2)\t0.3170703183040649\n",
" (1, 1)\t0.4790379614294201\n",
" (1, 7)\t0.6075989123184679\n",
" (2, 2)\t0.46263733109032296\n",
" (2, 8)\t0.8865476297873808\n",
" (3, 5)\t0.3314387711719163\n",
" (3, 4)\t0.3314387711719163\n",
" (3, 2)\t0.2709729130450805\n",
" (3, 6)\t0.4093928203750212\n",
" (3, 0)\t0.519262881857229\n",
" (3, 3)\t0.519262881857229\n"
]
}
],
"source": [
"print(Xprima)"
]
},
{
"cell_type": "markdown",
"metadata": {},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment