Actualización

parent 04919f65
......@@ -950,7 +950,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
......@@ -959,7 +959,7 @@
"'b=mat2\\nprint(b)\\nzip_b = zip(*b)\\nfor col_b in zip_b:\\n print(col_b)\\n'"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1018,8 +1018,8 @@
" cols_B = len(m2[0])\n",
"\n",
" if cols_A != fila_B:\n",
" print (\"Imposible resolver, la dimensionalidad de las matrices no cuadra.\")\n",
" return\n",
" print (\"Imposible resolver, la dimensionalidad de las matrices no cuadra.\")\n",
" return (0)\n",
"\n",
" C = [[0 for fila in range(cols_B)] for col in range(fila_A)]\n",
" print (C)\n",
......@@ -1055,7 +1055,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [
{
......@@ -1084,7 +1084,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
......@@ -1096,13 +1096,13 @@
"Ingrese las columnas de la matriz: \n",
"2\n",
"Ingresa el valor del elemento0,0: \n",
"2\n",
"10\n",
"Ingresa el valor del elemento0,1: \n",
"2\n",
"20\n",
"Ingresa el valor del elemento1,0: \n",
"2\n",
"30\n",
"Ingresa el valor del elemento1,1: \n",
"2\n"
"40\n"
]
}
],
......@@ -1155,7 +1155,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
......@@ -1168,10 +1168,10 @@
{
"data": {
"text/plain": [
"[[6, 6], [14, 14]]"
"[[70, 100], [150, 220]]"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
......@@ -1304,7 +1304,7 @@
" filas(): Un escalar\n",
" columnas(): Un escalar\n",
" determinante(): Un escalar\n",
" gauss()\n",
" gauss(): Regresa la matriz inversa o pseudo inversa, según el caso, por método de Gauss\n",
" \n",
" '''\n",
" def __init__ (self,matriz):\n",
......@@ -1452,25 +1452,6 @@
" return(\"Soy la matriz:\\n\"+ str(self.matriz))"
]
},
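{
"cell_type": "markdown",
"metadata": {},
"source": [
"A modo de ilustración, la siguiente celda es un esbozo mínimo e independiente de la inversión de una matriz cuadrada por eliminación de Gauss-Jordan, la idea detrás del método gauss() descrito en el docstring anterior. No es la implementación de la clase Matriz: la función gauss_jordan_inversa y la matriz de ejemplo son supuestos ilustrativos."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo ilustrativo (no es el método de la clase Matriz):\n",
"# inversa de una matriz cuadrada por eliminación de Gauss-Jordan.\n",
"\n",
"def gauss_jordan_inversa(m):\n",
"    n = len(m)\n",
"    # Matriz aumentada [m | I]\n",
"    aug = [fila[:] + [1.0 if i == j else 0.0 for j in range(n)]\n",
"           for i, fila in enumerate(m)]\n",
"    for col in range(n):\n",
"        # Pivoteo parcial: fila con mayor valor absoluto en la columna actual\n",
"        piv = max(range(col, n), key=lambda r: abs(aug[r][col]))\n",
"        if abs(aug[piv][col]) < 1e-12:\n",
"            raise ValueError(\"Matriz singular: no tiene inversa\")\n",
"        aug[col], aug[piv] = aug[piv], aug[col]\n",
"        # Normalizar la fila pivote\n",
"        p = aug[col][col]\n",
"        aug[col] = [x / p for x in aug[col]]\n",
"        # Eliminar la columna en las demás filas\n",
"        for r in range(n):\n",
"            if r != col:\n",
"                factor = aug[r][col]\n",
"                aug[r] = [a - factor * b for a, b in zip(aug[r], aug[col])]\n",
"    # La mitad derecha de la matriz aumentada es la inversa\n",
"    return [fila[n:] for fila in aug]\n",
"\n",
"print(gauss_jordan_inversa([[4.0, 7.0], [2.0, 6.0]]))"
]
},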
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"''' \n",
" else:\n",
" #original=self.matriz\n",
" #x=mmult1(original.T,self.matriz)\n",
" #x=Matriz(x)\n",
" #print(x)\n",
" #print(self.matriz)\n",
" #inve=x.gauss\n",
" #print(inve)\n",
" #mmult1(inve,original.T)\n",
" pass'''"
]
},
{
"cell_type": "code",
"execution_count": 1545,
......
......@@ -12,12 +12,12 @@
"\n",
"# 7. Machine Learning con scikit-learn\n",
"\n",
"Scikit-learn es probablemente la librería más útil para Machine Learning en Python, es de código abierto y es reutilizable en varios contextos, fomentando el uso académico y comercial. Proporciona una gama de algoritmos de aprendizaje supervisados y no supervisados en Python.\n"
"Scikit-learn es probablemente la biblioteca más útil para Machine Learning en Python, es de código abierto y es reutilizable en varios contextos, fomentando el uso académico y comercial. Proporciona una gama de algoritmos de aprendizaje supervisados y no supervisados en Python.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
......@@ -90,14 +90,29 @@
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 1 2 ... 8 9 8]\n"
"<class 'sklearn.utils.Bunch'>\n",
"[[ 0. 0. 5. ... 0. 0. 0.]\n",
" [ 0. 0. 0. ... 10. 0. 0.]\n",
" [ 0. 0. 0. ... 16. 9. 0.]\n",
" ...\n",
" [ 0. 0. 1. ... 6. 0. 0.]\n",
" [ 0. 0. 2. ... 12. 0. 0.]\n",
" [ 0. 0. 10. ... 12. 1. 0.]]\n",
"[0 1 2 ... 8 9 8]\n",
"[ 0. 0. 0. 0. 14. 13. 1. 0. 0. 0. 0. 5. 16. 16. 2. 0. 0. 0.\n",
" 0. 14. 16. 12. 0. 0. 0. 1. 10. 16. 16. 12. 0. 0. 0. 3. 12. 14.\n",
" 16. 9. 0. 0. 0. 0. 0. 5. 16. 15. 0. 0. 0. 0. 0. 4. 16. 14.\n",
" 0. 0. 0. 0. 0. 1. 13. 16. 1. 0.]\n",
"1\n"
]
}
],
......@@ -106,9 +121,13 @@
"\n",
"digits = datasets.load_digits()\n",
"\n",
"#digits.data y target viene como vector\n",
"print(digits.data[0])\n",
"print(digits.target)"
"print(type(digits))\n",
"\n",
"print(digits.data)\n",
"print(digits.target)\n",
"\n",
"print(digits.data[11])\n",
"print(digits.target[11])"
]
},
{
......@@ -122,33 +141,53 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0. 0. 5. 13. 9. 1. 0. 0.]\n",
" [ 0. 0. 13. 15. 10. 15. 5. 0.]\n",
" [ 0. 3. 15. 2. 0. 11. 8. 0.]\n",
" [ 0. 4. 12. 0. 0. 8. 8. 0.]\n",
" [ 0. 5. 8. 0. 0. 9. 8. 0.]\n",
" [ 0. 4. 11. 0. 1. 12. 7. 0.]\n",
" [ 0. 2. 14. 5. 10. 12. 0. 0.]\n",
" [ 0. 0. 6. 13. 10. 0. 0. 0.]]\n",
"\n",
"\n",
"\n",
"0\n"
]
"data": {
"text/plain": [
"array([[ 0., 0., 0., 0., 14., 13., 1., 0.],\n",
" [ 0., 0., 0., 5., 16., 16., 2., 0.],\n",
" [ 0., 0., 0., 14., 16., 12., 0., 0.],\n",
" [ 0., 1., 10., 16., 16., 12., 0., 0.],\n",
" [ 0., 3., 12., 14., 16., 9., 0., 0.],\n",
" [ 0., 0., 0., 5., 16., 15., 0., 0.],\n",
" [ 0., 0., 0., 4., 16., 14., 0., 0.],\n",
" [ 0., 0., 0., 1., 13., 16., 1., 0.]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#digits.image es lo mismo que digits.data, pero en forma matricial.\n",
"print(digits.images[0])\n",
"print(\"\\n\\n\")\n",
"print(digits.target[0])"
"digits.images[11]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0., 0., 0., 0., 14., 13., 1., 0., 0., 0., 0., 5., 16.,\n",
" 16., 2., 0., 0., 0., 0., 14., 16., 12., 0., 0., 0., 1.,\n",
" 10., 16., 16., 12., 0., 0., 0., 3., 12., 14., 16., 9., 0.,\n",
" 0., 0., 0., 0., 5., 16., 15., 0., 0., 0., 0., 0., 4.,\n",
" 16., 14., 0., 0., 0., 0., 0., 1., 13., 16., 1., 0.])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"digits.data[11]\n"
]
},
{
......@@ -170,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
......@@ -181,7 +220,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 3,
"metadata": {},
"outputs": [
{
......@@ -193,7 +232,7 @@
" tol=0.001, verbose=False)"
]
},
"execution_count": 37,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
......@@ -204,16 +243,16 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([8, 9, 8])"
"array([8])"
]
},
"execution_count": 51,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
......@@ -223,17 +262,29 @@
"# In this case, you’ll predict using the last image from digits.data.\n",
"# By predicting, you’ll determine the image from the training set that best matches the last image.\n",
"\n",
"clf.predict(digits.data[-3:])"
"clf.predict(digits.data[-1:])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"execution_count": 29,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAS8AAAEyCAYAAACrlladAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADelJREFUeJzt3d+rpAd9x/H3x03EGG0WmkMI2dCTCwmI0N0wpIgS0wQlqZLtRS8SUKgUthcqu7QgsTeN/4DaiyIsSYzFaNBogoj1B5jVCjU6iWs12Vhi2CUbf+wEMTFedIl+e7ETutmePfMcc2ae89X3Cw57fgwzH5bkvc8zM2cmVYUkdfOqsQdI0u/DeElqyXhJasl4SWrJeElqyXhJasl4SWrJeElqyXhJaumCZVzppZdeWuvr68u46pZOnz499gQAfvrTn449AYCLLrpo7AkAXHbZZWNP0DmOHz/Os88+myGXXUq81tfXmU6ny7jqlo4fPz72BADuuOOOsScAsHfv3rEnAHDo0KGxJ+gck8lk8GU9bZTUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktTSoHgluSnJj5M8meT2ZY+SpEUWxivJLuBfgZuBNwK3JXnjsodJ0maGHHldCzxZVU9V1WngPmD/cmdJ0uaGxOsK4Omzvj45/97LJDmQZJpkOpvNtmufJG1o2+6wr6rDVTWpqsna2tp2Xa0kbWhIvJ4Brjzr6z3z70nSaIbE63vAG5JcleTVwK3AF5c7S5I2t/A17KvqxSTvB74K7ALurqrHlr5MkjYx6A04qurLwJeXvEWSBvMZ9pJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaMl6SWkpVbfuVTiaTmk6n2369Xa2vr489AYATJ06MPWFHueSSS8aeAMDx48fHngDA7t27x57AZDJhOp1myGU98pLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLU0sJ4Jbk7yakkP1rFIEkaYsiR1z3ATUveIUlbsjBeVfUt4Jcr2CJJg23bfV5JDiSZJpnOZrPtulpJ2tC2xauqDlfVpKoma2tr23W1krQhH22U1JLxktTSkKdKfAb4T+DqJCeT/N3yZ0nS5i5YdIGqum0VQyRpKzxtlNSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLU0sLfbezsyJEjY08A4MSJE2NPAOCjH/3o2BMAuP7668eeAMC+ffvGngDAPffcM/YEAA4dOjT2hC3xyEtSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSSwvjleTKJA8leTzJY0kOrmKYJG1myEvivAj8Y1U9muT1wCNJvl5Vjy95mySd18Ijr6r6WVU9Ov/818Ax4IplD5OkzWzpPq8k68A+4OENfnYgyTTJdDabbc86STqPwfFK8jrg88Chqnr+3J9X1eGqmlTVZG1tbTs3StL/MyheSS7kTLjuraovLHeSJC025NHGAHcBx6rqI8ufJEmLDTnyegvwHuCGJEfnH3+15F2StKmFT5Woqm8DWcEWSRrMZ9hLasl4SWrJeElqyXhJasl4SWrJeElqyXhJasl4SWrJeElqyXhJasl4SWppyMtAt/Xcc8+NPWFHOXr06NgTtIG9e/eOPaElj7wktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktbQwXklek+S7SX6Q5LEkH17FMEnazJCXxPkf4IaqeiHJhcC3k/x7VX1nydsk6bwWxquqCnhh/uWF849a5ihJWmTQfV5JdiU5CpwCvl5VD29wmQNJpkmms9lsu3dK0ssMildV/baq9gJ7gGuTvGmDyxyuqklVTdbW1rZ7pyS9zJYebayqXwEPATctZ44kDTPk0ca1JLvnn18EvB14YtnDJGkzQx5tvBz4ZJJdnIndZ6vqS8udJUmbG/Jo438B+1awRZIG8xn2kloyXpJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaGvKL2W3t379/7AkAPPjgg2NPAODgwYNjTwDgyJEjY0/QHwCPvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1NDheSXYl+X6SLy1zkCQNsZUjr4PAsWUNkaStGBSvJHuAdwJ3LneOJA0z9MjrY8AHgd+d7wJJDiSZJpnOZrNtGSdJ57MwXkneBZyqqkc2u1xVHa6qSVVN1tbWtm2gJG1kyJHXW4BbkhwH7gNuSPKppa6SpAUWxquqPlRVe6pqHbgV+EZVvXvpyyRpEz7PS1JLW3r3oKo6AhxZyhJJ2gKPvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktbSl323U72f//v1jTwB2zo6dIsnYEwBYX18fe0JLHnlJasl4SWrJeElqyXhJasl4SWrJeElqyXhJasl4SWrJeElqyXhJasl4SWrJeElqyXhJamnQq0okOQ78Gvgt8GJVTZY5SpIW2cpL4vxlVT27tCWStAWeNkpqaWi8CvhakkeSHNjoAkkOJJkmmc5ms+1bKEkbGBqvt1bVNcDNwPuSXHfuBarqcFVNqmqytra2rSMl6VyD4lVVz8z/PAU8AFy7zFGStMjCeCW5OMnrX/oceAfwo2UPk6TNDHm08TLggfmbFVwAfLqqvrLUVZK0wMJ4VdVTwJ+vYIskDeZTJSS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktWS8JLVkvCS1ZLwktbSVl4HW7+nIkSNjTwDg6NGjY0+Qto1HXpJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaMl6SWjJekloyXpJaGhSvJLuT3J/kiSTHkrx52cMkaTNDXxLnX4CvVNXfJHk18NolbpKkhRbGK8klwHXA3wJU1Wng9HJnSdLmhpw2XgXMgE8k+X6SO5NcfO6FkhxIMk0ync1m2z5Uks42JF4XANcAH6+qfcBvgNvPvVBVHa6qSVVN1tbWtnmmJL3ckHidBE5W1cPzr+/nTMwkaTQL41VVPweeTnL1/Fs3Ao8vdZUkLTD00cYPAPfOH2l8Cnjv8iZJ0mKD4lVVR4HJkrdI0mA+w15SS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUkvGS1JLxktSS8ZLUktDfzFbr8Bzzz039gQAHnzwwbEnAPDNb35z7AkAvO1tbxt7AgDr6+tjT2jJIy9JLRkvSS0ZL0ktGS9JLRkvSS0ZL0ktGS9JLRkvSS0ZL0ktGS9JLRkvSS0ZL0ktGS9JLS2MV5Krkxw96+P5JIdWMU6SzmfhS+JU1Y+BvQBJdgHPAA8seZckbWqrp403A
j+pqhPLGCNJQ201XrcCn9noB0kOJJkmmc5ms1e+TJI2MTheSV4N3AJ8bqOfV9XhqppU1WRtbW279knShrZy5HUz8GhV/WJZYyRpqK3E6zbOc8ooSas2KF5JLgbeDnxhuXMkaZhB7x5UVb8B/nTJWyRpMJ9hL6kl4yWpJeMlqSXjJakl4yWpJeMlqSXjJakl4yWpJeMlqSXjJakl4yWppVTV9l9pMgNe6autXgo8uw1zXil37KwN4I5z/SHt+LOqGvSCgEuJ13ZIMq2qiTt2zo6dsMEd7niJp42SWjJeklrayfE6PPaAOXf8n52wAdxxrj/KHTv2Pi9J2sxOPvKSpPMyXpJa2nHxSnJTkh8neTLJ7SNtuDvJqSQ/GuP2z9pxZZKHkjye5LEkB0fa8Zok303yg/mOD4+x46w9u5J8P8mXRtxwPMkPkxxNMh1xx+4k9yd5IsmxJG8eYcPV87+Hlz6eT3Jo6be7k+7zSrIL+G/OvFPRSeB7wG1V9fiKd1wHvAD8W1W9aZW3fc6Oy4HLq+rRJK8HHgH+eoS/jwAXV9ULSS4Evg0crKrvrHLHWXv+AZgAf1JV7xppw3FgUlWjPjk0ySeB/6iqO+dvDP3aqvrViHt2Ac8Af1FVr/SJ6pvaaUde1wJPVtVTVXUauA/Yv+oRVfUt4Jervt0Ndvysqh6df/5r4BhwxQg7qqpemH954fxjlH/1kuwB3gncOcbt7yRJLgGuA+4CqKrTY4Zr7kbgJ8sOF+y8eF0BPH3W1ycZ4X/WnSjJOrAPeHik29+V5ChwCvh6VY2yA/gY8EHgdyPd/ksK+FqSR5IcGGnDVcAM+MT8NPrO+XusjulWVvTm1DstXtpAktcBnwcOVdXzY2yoqt9W1V5gD3BtkpWfTid5F3Cqqh5Z9W1v4K1VdQ1wM/C++V0Nq3YBcA3w8araB/wGGOV+YoD5aestwOdWcXs7LV7PAFee9fWe+ff+aM3vY/o8cG9Vjf6O5fPTkoeAm0a4+bcAt8zvb7oPuCHJp0bYQVU9M//zFPAAZ+7yWLWTwMmzjoLv50zMxnIz8GhV/WIVN7bT4vU94A1JrppX/FbgiyNvGs38jvK7gGNV9ZERd6wl2T3//CLOPKDyxKp3VNWHqmpPVa1z5r+Nb1TVu1e9I8nF8wdQmJ+mvQNY+SPTVfVz4OkkV8+/dSOw0gdzznEbKzplhDOHnTtGVb2Y5P3AV4FdwN1V9diqdyT5DHA9cGmSk8A/V9Vdq97BmSON9wA/nN/fBPBPVfXlFe+4HPjk/JGkVwGfrarRnqawA1wGPHDm3xYuAD5dVV8ZacsHgHvn/9g/Bbx3jBHziL8d+PuV3eZOeqqEJA21004bJWkQ4yWpJeMlqSXjJakl4yWpJeMlqSXjJaml/wV3Tjqmt/l4+AAAAABJRU5ErkJggg==\n",
"text/plain": [
"array([1])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAS8AAAEyCAYAAACrlladAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADg1JREFUeJzt3f+L5IV9x/HXyz1PV3PNgQ5BXOmKRCEIURkswSBVMWgjpj/0B4UEGgrXH5KgtBBMf6n5B4L9oQQONbXEKIlGCIc1EWJIhcY4d17ql9Ng9Kp3JO5kg3gn4nLrqz/sSNfr3s5n4nzmM2/zfMByO7vDzIvzfO5nvuyMkwgAqjmt6wEA8McgXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4AStrRxoWee+65WV5ebuOiJ/Lee+91PUGSdNpp/IzY7MSJE11PkDQ/O84888yuJ0iS1tfXu56g1157Taurq25y3lbitby8rMFg0MZFT+Sdd97peoIkaXFxsesJc2V1dbXrCZKklZWVridIki6++OKuJ0iSjh071vUEXXPNNY3PyyEBgJKIF4CSiBeAkogXgJKIF4CSiBeAkogXgJKIF4CSiBeAkogXgJKIF4CSiBeAkhrFy/YNtl+y/bLtO9oeBQDjjI2X7QVJ/yrpRkmfknSr7U+1PQwAttPkyOtKSS8neSXJmqQHJX2h3VkAsL0m8Tpf0uubTh8Zfe0DbO+xPbA9GA6H09oHAFua2h32SfYm6Sfp93q9aV0sAGypSbyOSrpg0+ml0dcAoDNN4vW0pE/avtD2Tkm3SPpRu7MAYHtjX8M+yQnbX5X0Y0kLku5N8nzrywBgG43egCPJo5IebXkLADTGM+wBlES8AJREvACURLwAlES8AJREvACURLwAlES8AJREvACURLwAlES8AJTU6HcbJ7W2tqbDhw+3cdETWV5e7nrCXHnzzTe7niBJc/FvY56sr693PUGSdNddd3U9QW+88Ubj83LkBaAk4gWgJOIFoCTiBaAk4gWgJOIFoCTiBaAk4gWgJOIFoCTiBaAk4gWgJOIFoCTiBaCksfGyfa/tFdvPzWIQADTR5Mjr3yTd0PIOAJjI2Hgl+bmkP8xgCwA0NrX7vGzvsT2wPVhdXZ3WxQLAlqYWryR7k/ST9M8555xpXSwAbIlHGwGURLwAlNTkqRIPSPovSZfYPmL779qfBQDbG/vuQUluncUQAJgENxsBlES8AJREvACURLwAlES8AJREvACURLwAlES8AJREvACURLwAlES8AJQ09ncb/xg7d+7U8vJyGxddku2uJ0iSnn322a4nzJVdu3Z1PUHSxv8v8+DOO+/seoL27dvX+LwceQEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoaWy8bF9g+wnbL9h+3vZtsxgGANtp8pI4JyT9Y5IDtndJ2m/78SQvtLwNAE5p7JFXkt8mOTD6/JikQ5LOb3sYAGxnovu8bC9LulzSU1t8b4/tge3BcDiczjoAOIXG8bL9MUkPS7o9yVsnfz/J3iT9JP1erzfNjQDw/zSKl+3TtRGu+5P8sN1JADBek0cbLekeSYeSfKv9SQAwXpMjr6skfUnStbYPjj7+quVdALCtsU+VSPKkpPl4+xsAGOEZ9gBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASmryMtBlra+vdz1BkvTqq692PUGStLq62vUEYGo48gJQEvECUBLxAlAS8QJQEvECUBLxAlAS8QJQEvECUBLxAlAS8QJQEvECUBLxAlAS8QJQ0th42T7T9i9t/8r287a/OYthALCdJi+J866ka5Mct326pCdt/0eSX7S8DQBOaWy8kkTS8dHJ00cfaXMUAIzT6D4v2wu2D0pakfR4kqe2OM8e2wPbg+FwOO2dAPABjeKVZD3JZZKWJF1p+9ItzrM3ST9Jv9frTXsnAHzARI82JnlT0hOSbmhnDgA00+TRxp7t3aPPFyVdL+nFtocBwHaaPNp4nqT7bC9oI3bfT7Kv3VkAsL0mjzb+t6TLZ7AFABrjGfYASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEpq8ovZZa2trXU9QZK0vLzc9QRJ0vHjx8efaQbefffdridIks4666yuJ+BD4MgLQEnEC0BJxAtAScQLQEnEC0BJxAtAScQLQEnEC0BJxAtAScQLQEnEC0BJxAtAScQLQEmN42V7wfYztve1OQgAmpjkyOs2SYfaGgIAk2gUL9tLkj4v6e525wBAM02PvO6S9HVJ753qDLb32B7YHgyHw6mMA4BTGRsv2zdJWkmyf7vzJdmbpJ+k3+v1pjYQALbS5MjrKkk32z4s6UFJ19r+bqurAGCMsfFK8o0kS0mWJd0i6adJvtj6MgDYBs/zAlDSRO8elORnkn7WyhIAmABHXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASprodxurWVxc7HrCXFlaWup6giRp9+7dXU/ARwBHXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKavSqErYPSzomaV3SiST9NkcBwDiTvCTONUl+39oSAJgANxsBlNQ0XpH0E9v7be/Z6gy299ge2B4Mh8PpLQSALTSN12eTXCHpRklfsX31yWdIsjdJP0m/1+tNdSQAnKxRvJIcHf25IukRSVe2OQoAxhkbL9tn2971/ueSPifpubaHAcB2mjza+AlJj9h+//zfS/JYq6sAYIyx8UryiqRPz2ALADTGUyUAlES8AJREvACURLwAlES8AJREvACURLwAlES8AJREvACURLwAlES8AJQ0yctAo7gdO+bjP/f+/fu7niBJOuOMM7qeIEm69NJLu55QEkdeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEpqFC/bu20/ZPtF24dsf6btYQCwnaavkfIvkh5L8je2d0o6q8VNADDW2HjZ/rikqyX9rSQlWZO01u4sANhek5uNF0oaSvqO7Wds32377JPPZHuP7YHtwXA4nPpQANisSbx2SLpC0reTXC7pbUl3nHymJHuT9JP0e73elGcCwAc1idcRSUeSPDU6/ZA2YgYAnRkbryS/k/S67UtGX7pO0gutrgKAMZo+2vg1SfePHml8RdKX25sEAOM1ileSg5L6LW8BgMZ4hj2AkogXgJKIF4CSiBeAkogXgJKIF4CSiBeAkogXgJKIF4CSiBeAkogXgJKa/mI2PgIWFxe7niBJuuiii7qeIEnatWtX1xMkSevr611PkCQtLCx0PWEiHHkBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKIl4ASiJeAEoiXgBKGlsvGxfYvvgpo+3bN8+i
3EAcCpjXxInyUuSLpMk2wuSjkp6pOVdALCtSW82XifpN0n+p40xANDUpPG6RdIDW33D9h7bA9uD4XD44ZcBwDYax8v2Tkk3S/rBVt9PsjdJP0m/1+tNax8AbGmSI68bJR1I8kZbYwCgqUnidatOcZMRAGatUbxsny3pekk/bHcOADTT6N2Dkrwt6ZyWtwBAYzzDHkBJxAtAScQLQEnEC0BJxAtAScQLQEnEC0BJxAtAScQLQEnEC0BJxAtASU4y/Qu1h5I+7Kutnivp91OY82GxY742SOw42Udpx58nafSCgK3EaxpsD5L02TE/O+ZhAzvY8T5uNgIoiXgBKGme47W36wEj7Pg/87BBYsfJ/iR3zO19XgCwnXk+8gKAUyJeAEqau3jZvsH2S7Zftn1HRxvutb1i+7kurn/TjgtsP2H7BdvP276tox1n2v6l7V+Ndnyzix2b9izYfsb2vg43HLb9rO2Dtgcd7tht+yHbL9o+ZPszHWy4ZPT38P7HW7Zvb/165+k+L9sLkn6tjXcqOiLpaUm3JnlhxjuulnRc0r8nuXSW133SjvMknZfkgO1dkvZL+usO/j4s6ewkx22fLulJSbcl+cUsd2za8w+S+pL+LMlNHW04LKmfpNMnh9q+T9J/Jrl79MbQZyV5s8M9C5KOSvqLJB/2ierbmrcjryslvZzklSRrkh6U9IVZj0jyc0l/mPX1brHjt0kOjD4/JumQpPM72JEkx0cnTx99dPJTz/aSpM9LuruL658ntj8u6WpJ90hSkrUuwzVynaTftB0uaf7idb6k1zedPqIO/medR7aXJV0u6amOrn/B9kFJK5IeT9LJDkl3Sfq6pPc6uv73RdJPbO+3vaejDRdKGkr6zuhm9N2j91jt0i2a0ZtTz1u8sAXbH5P0sKTbk7zVxYYk60kuk7Qk6UrbM785bfsmSStJ9s/6urfw2SRXSLpR0ldGdzXM2g5JV0j6dpLLJb0tqZP7iSVpdLP1Zkk/mMX1zVu8jkq6YNPppdHX/mSN7mN6WNL9STp/x/LRzZInJN3QwdVfJenm0f1ND0q61vZ3O9ihJEdHf65IekQbd3nM2hFJRzYdBT+kjZh15UZJB5K8MYsrm7d4PS3pk7YvHFX8Fkk/6nhTZ0Z3lN8j6VCSb3W4o2d79+jzRW08oPLirHck+UaSpSTL2vi38dMkX5z1Dttnjx5A0ehm2uckzfyR6SS/k/S67UtGX7pO0kwfzDnJrZrRTUZp47BzbiQ5Yfurkn4saUHSvUmen/UO2w9I+ktJ59o+Iumfk9wz6x3aONL4kqRnR/c3SdI/JXl0xjvOk3Tf6JGk0yR9P0lnT1OYA5+Q9MjGzxbtkPS9JI91tOVrku4f/bB/RdKXuxgxivj1kv5+Ztc5T0+VAICm5u1mIwA0QrwAlES8AJREvACURLwAlES8AJREvACU9L+nG0Jqc6MFTgAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 360x360 with 1 Axes>"
]
......@@ -251,16 +302,38 @@
"\n",
"#Display the first digit\n",
"plt.figure(1, figsize=(5, 5))\n",
"plt.imshow(digits.images[-2], cmap=plt.cm.gray_r, interpolation='nearest')\n",
"plt.show()"
"plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest')\n",
"\n",
"gato = np.array([[ 0., 3., 0., 0., 6., 13., 1., 5., 10., 0., 0., 5., 16.,\n",
" 16., 2., 40., 0., 0., 80., 14., 16., 12., 0., 0., 0., 1.,\n",
" 10., 16., 16., 12., 0., 0., 0., 3., 0., 14., 16., 9., 0.,\n",
" 0., 0., 0., 0., 5., 0., 0., 0., 0., 0., 0., 0., 4.,\n",
" 16., 14., 0., 0., 0., 0., 0., 1., 5., 1., 1., 0.]]).reshape(8,8)\n",
"\n",
"plt.imshow( gato ,cmap=plt.cm.gray_r, interpolation='nearest')\n",
"\n",
"clf.predict(gato.reshape(1,8*8))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"digits.target[-1]"
]
},
{
"cell_type": "markdown",
......@@ -487,9 +560,18 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 270,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'numpy.ndarray'>\n",
"[[4.7 3.2 1.3 0.2]]\n",
"(1,)\n"
]
},
{
"data": {
"text/plain": [
......@@ -499,19 +581,22 @@
" tol=0.001, verbose=False)"
]
},
"execution_count": 61,
"execution_count": 270,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import svm\n",
"from sklearn import svm, datasets\n",
"\n",
"\n",
"clf = svm.SVC(gamma='scale')\n",
"iris = datasets.load_iris()\n",
"X, y = iris.data, iris.target\n",
"\n",
"print(type(X))\n",
"print(X[2:3])\n",
"print(y[0:1].shape)\n",
"clf.fit(X, y) "
]
},
......@@ -527,7 +612,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
......@@ -539,7 +624,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 14,
"metadata": {},
"outputs": [
{
......@@ -548,7 +633,7 @@
"0"
]
},
"execution_count": 63,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
......@@ -566,7 +651,7 @@
"source": [
"## 7.6 Aprendizaje no superizado: buscando estructura en los datos\n",
"\n",
"### 7.6.1 Clustering: agroupamiento de observaciones\n",
"### 7.6.1 Clustering: agrupamiento de observaciones\n",
"\n",
"Imaginemos que contamos con las observacione del dataset iris, de las cuales sabemos que hay tres tipos pero no tenemos acceso a la categorización del taxonomista que nos diga a qué tipo corresponde cada observación.\n",
"En este caso se puede usar un método de aprendizaje no supervisado como clustering que consiste en partir las observaciones en grupos bien separados llamados clusters.\n",
......@@ -576,14 +661,14 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]\n",
"[1 1 1 1 1 0 0 0 0 0 2 2 2 2 2]\n",
"[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]\n"
]
}
......@@ -729,22 +814,524 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Métricas de evaluación"
"# 8. Métricas de Evaluación en Minería de Datos\n",
"\n",
"## Contenido de la Presentación\n",
"\n",
"https://docs.google.com/presentation/d/1kjkdn_m0jZjLYmp4GuPyXtDwi9wdUKPkQISXhQD5cCk/edit?usp=sharing\n",
"\n",
"\n",
"\n",
"#### 8.1 Técnicas de evaluación en Aprendizaje Automático:\n",
" - Motivación SPAM!\n",
" - Gold Standard\n",
" - Tabla de contingencia\n",
" - Exactitud (Accuracy)\n",
" - Precisión\n",
" - Exhaustividad (Recall)\n",
"\n",
"#### 8.2 Validación de evaluación en Aprendizaje Automático:\n",
" - Conjunto de entrenamiento, Conjunto de prueba\n",
" - Validación cruzada\n",
" \n",
" \n"
]
},
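{
"cell_type": "markdown",
"metadata": {},
"source": [
"Como ilustración de las métricas listadas arriba, la siguiente celda es un esbozo mínimo que calcula la tabla de contingencia, la exactitud, la precisión y la exhaustividad con sklearn.metrics. Las etiquetas de ejemplo (listas y_real y y_pred) son inventadas y no provienen de ningún dataset del curso."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo ilustrativo: métricas de evaluación a partir de una tabla de contingencia.\n",
"# Las etiquetas de ejemplo son inventadas.\n",
"from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score\n",
"\n",
"y_real = ['SPAM', 'SPAM', 'HAM', 'HAM', 'HAM', 'SPAM']\n",
"y_pred = ['SPAM', 'HAM', 'HAM', 'HAM', 'SPAM', 'SPAM']\n",
"\n",
"# Tabla de contingencia (matriz de confusión)\n",
"print(confusion_matrix(y_real, y_pred, labels=['SPAM', 'HAM']))\n",
"print('Exactitud: ', accuracy_score(y_real, y_pred))\n",
"print('Precisión: ', precision_score(y_real, y_pred, pos_label='SPAM'))\n",
"print('Recall:    ', recall_score(y_real, y_pred, pos_label='SPAM'))"
]
},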
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejemplos completos en casos de uso"
"# Ejemplos completos en sklearn"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 178,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": [
"0.98"
]
},
"execution_count": 178,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cada estimador (clasificador) debe exponer un método \"score\" que nos indica la calidad de predicción.\n",
"\n",
"from sklearn import datasets, svm\n",
"\n",
"digits = datasets.load_digits()\n",
"X_digits = digits.data\n",
"y_digits = digits.target\n",
"svc = svm.SVC(C=1, kernel='linear')\n",
"svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])\n"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'numpy.ndarray'>\n",
"[0.9348914858096828, 0.9565943238731218, 0.9398998330550918]\n"
]
}
],
"source": [
"# Para tener una mejor estimación del modelo, podemos dividir los datos que usamos para entrenamiento y test.\n",
"# A esto se le llama validación cruzada\n",
"import numpy as np\n",
"\n",
"X_folds = np.array_split(X_digits, 3)\n",
"y_folds = np.array_split(y_digits, 3)\n",
"scores = list()\n",
"\n",
"for k in range(3):\n",
" # We use 'list' to copy, in order to 'pop' later on\n",
" X_train = list(X_folds)\n",
" X_test = X_train.pop(k)\n",
" X_train = np.concatenate(X_train)\n",
" y_train = list(y_folds)\n",
" y_test = y_train.pop(k)\n",
" y_train = np.concatenate(y_train)\n",
" scores.append(svc.fit(X_train, y_train).score(X_test, y_test))\n",
"print(scores) \n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train: [2 3 4 5 6 7 8 9] | test: [0 1]\n",
"Train: [0 1 4 5 6 7 8 9] | test: [2 3]\n",
"Train: [0 1 2 3 6 7 8 9] | test: [4 5]\n",
"Train: [0 1 2 3 4 5 8 9] | test: [6 7]\n",
"Train: [0 1 2 3 4 5 6 7] | test: [8 9]\n"
]
},
{
"data": {
"text/plain": [
"array([0.96578289, 0.92708922, 0.96681476, 0.96362897, 0.93192644])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ejemplo de uso de la métrica de precisión_macro y Kfolds 5\n",
"\n",
"from sklearn.model_selection import KFold, cross_val_score\n",
"\n",
"X = [\"a\", \"a\", \"a\", \"b\", \"b\", \"c\", \"c\", \"c\", \"c\", \"c\"]\n",
"\n",
"k_fold = KFold(n_splits=5)\n",
"for train_indices, test_indices in k_fold.split(X):\n",
" print('Train: %s | test: %s' % (train_indices, test_indices))\n",
"\n",
"cross_val_score(svc, X_digits, y_digits, cv=k_fold,\n",
" scoring='precision_macro')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejercicios"
]
},
{
"cell_type": "code",
"execution_count": 297,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Para asegurar la entrega de nuestros e-mail en su correo, por favor agregue mkt@superboletos.com a su libreta de direcciones de correo.Si no visualiza bien este mensaje, haga clic aquí\n",
"\n",
"\n",
"Hola Jóvenes. Les reenvío el correo de la Maestra Rosa Peralta para la actividad que tienen pendiente con ella. Aprovecho para agradecer el esfuerzo y el empaño para con estas sesiones de clase.\n",
"\n",
"\n",
"Hola! Pues eres parte del laboratorio si así lo quieres, justo uno de los principios fundamentales es que aquí todos estemos por gusto. Me gustaría muchísimo que trabajaras con nosotros y, en ese caso, ya tengo pensada tu primera tarea jajajaja. Sería ayudar a organizar el material de un curso de geoinformática, lo que hay que hacer es actualizar las prácticas y hacer una linda paginita para el curso. Si te animas platicamos en la semana.\n"
]
}
],
"source": [
"import json\n",
"with open('data/Spam.json') as json_file: \n",
" dataS = json.load(json_file)\n",
" \n",
"print(dataS[0][\"texto\"])\n",
"\n",
"with open('data/Ham.json') as json1_file:\n",
" dataH=json.load(json1_file)\n",
"\n",
"print(\"\\n\")\n",
"print(dataH[0][\"texto\"])\n",
"\n",
"with open('data/Test.json') as json2_file:\n",
" dataT=json.load(json2_file)\n",
"\n",
"print(\"\\n\")\n",
"print(dataT[0][\"texto\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##  1. SpamDetector\n",
"\n",
"Implemente la clase SpamDetector usando sklearn. La clase debe ser un estimador para la clasificación binaria de textos (correo electrónico) en dos categorías: \"HAM\", \"SPAM\". El constructor debe recibir los textos de cada categoría así como sus respectivas etiquetas. Debe implementar los métodos $fit(X, y)$ y $predict(T)$, donde $X$ es una matriz de $(nDocumentos,mPalabras)$ cuyos valores corresponden a las frecuencias. \n",
"\n",
"\n",
"Utilice como datos de ejemplo, una selección de su propio correo electrónico personal.\n"
]
},
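{
"cell_type": "markdown",
"metadata": {},
"source": [
"Antes de la implementación manual con term frequency de la celda siguiente, este es un esbozo mínimo de la misma idea usando CountVectorizer de sklearn. Supone que dataS, dataH y dataT ya fueron cargados en la celda anterior y que cada elemento tiene las llaves \"texto\" y \"spam\"; no sustituye a la clase DetectorSpam que pide el ejercicio."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo mínimo del ejercicio con CountVectorizer + SVC (supone dataS, dataH y dataT ya cargados).\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn import svm\n",
"\n",
"textos = [d['texto'] for d in dataS] + [d['texto'] for d in dataH]\n",
"etiquetas = [d['spam'] for d in dataS] + [d['spam'] for d in dataH]\n",
"\n",
"vec = CountVectorizer(lowercase=True)\n",
"X = vec.fit_transform(textos)     # matriz (nDocumentos, mPalabras) de frecuencias\n",
"\n",
"clf = svm.SVC(gamma='scale')\n",
"clf.fit(X, etiquetas)\n",
"\n",
"T = vec.transform([d['texto'] for d in dataT])   # mismas columnas que en el entrenamiento\n",
"print(clf.predict(T))"
]
},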
{
"cell_type": "code",
"execution_count": 399,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"63\n",
"267\n",
"307\n",
"[1 1 1 1 1 0 0 0 0 0]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:152: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n"
]
}
],
"source": [
"import re\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import svm, datasets\n",
"import pickle\n",
"\n",
"class DetectorSpam():\n",
" def __init__(self,listaH,listaS):\n",
" self.listaH=listaH\n",
" self.listaS=listaS\n",
" self.lista_pal_H=[]\n",
" self.lista_pal_S=[]\n",
" self.lista_pal_T=[]\n",
" #self.lista_pal_Test=[]\n",
" self.lista_label_y=[]\n",
" self.X=None\n",
" self.y=None\n",
" \n",
" def y_label(self):\n",
" for clase in self.listaS:\n",
" self.lista_label_y.append(clase[\"spam\"])\n",
" \n",
" for clase in self.listaH:\n",
" self.lista_label_y.append(clase[\"spam\"])\n",
" \n",
" return(self.lista_label_y)\n",
" \n",
" \n",
" def terminos(self):\n",
" '''\n",
" Función que calcula obtiene todas las palabras existentes en los n textos de la lista de textos\n",
" \n",
" Parameters:\n",
" None\n",
" \n",
" Returns:\n",
" self.lista_pal(list): Lista con todos las palabras sin repeticiones de los n textos\n",
" '''\n",
" for textos in self.listaS:\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" self.lista_pal_S.append(palabra)\n",
" \n",
" self.lista_pal_S = list( dict.fromkeys(self.lista_pal_S))\n",
" \n",
" for textos in self.listaH:\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" self.lista_pal_H.append(palabra)\n",
" \n",
" self.lista_pal_H = list( dict.fromkeys(self.lista_pal_H))\n",
" self.lista_pal_T=self.lista_pal_S+self.lista_pal_H\n",
" \n",
" ###Ultima inserción importante\n",
" self.lista_pal_T = list( dict.fromkeys(self.lista_pal_T))\n",
" #print(len(self.lista_pal_H))\n",
" #print(len(self.lista_pal_S))\n",
" #print(len(self.lista_pal_T))\n",
" return(0)\n",
" \n",
" def tf(self):\n",
" '''\n",
" Función que calcula el term frequency para n textos\n",
" \n",
" Parameters:\n",
" None\n",
" \n",
" Returns:\n",
" df(pandas.DataFrame): DataFrame de Pandas que contiene en las filas los textos y en\n",
" las columnas cada palabra. Para cada celda se calcula:\n",
" log(#palabras)+1\n",
" '''\n",
" #self.buscar()\n",
" self.terminos()\n",
" x=self.lista_pal_S\n",
" y=self.lista_pal_H\n",
" #self.buscar()\n",
" dfS=pd.DataFrame(columns=x)\n",
" dfS=dfS.T\n",
" \n",
" dfH=pd.DataFrame(columns=y)\n",
" dfH=dfH.T\n",
" \n",
" contador=0\n",
" for textos in self.listaS:\n",
" dictiS={}\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" dictiS.update({palabra:splito.count(palabra)})\n",
" \n",
" #print(\"\\n\\n\")\n",
" #print(dicti.items()) \n",
" \n",
" for palabra in x:\n",
" if palabra in dictiS.keys():\n",
" dictiS.update({str(palabra):np.log(dictiS[str(palabra)])+1})\n",
" elif palabra not in dictiS.keys():\n",
" dictiS.update({str(palabra):0})\n",
" \n",
" dfS.insert(contador,\"Texto: \"+str(contador),dictiS.values())\n",
" contador=contador+1\n",
" \n",
" dfS=dfS.T\n",
" \n",
" contador=0\n",
" for textos in self.listaH:\n",
" dictiH={}\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" dictiH.update({palabra:splito.count(palabra)})\n",
" \n",
" #print(\"\\n\\n\")\n",
" #print(dictiH.items()) \n",
" \n",
" for palabra in y:\n",
" if palabra in dictiH.keys():\n",
" dictiH.update({str(palabra):np.log(dictiH[str(palabra)])+1})\n",
" elif palabra not in dictiH.keys():\n",
" dictiH.update({str(palabra):0})\n",
" \n",
" dfH.insert(contador,\"Texto: \"+str(5+contador),dictiH.values())\n",
" contador=contador+1 \n",
" dfH=dfH.T\n",
" dfT=pd.concat([dfS,dfH])\n",
" dfT=dfT.fillna(0)\n",
" dfT.sort_index(axis=1, inplace=True)\n",
" return(dfT)\n",
" \n",
" def ajuste(self):\n",
" x=self.tf()\n",
" y=self.y_label()\n",
" #print(x)\n",
" a = np.array(x)\n",
" b= np.array(y)\n",
" #print(a[0:1].shape)\n",
" clf = svm.SVC(gamma='scale')\n",
" self.X, self.y = a, b\n",
" #print(type(X))\n",
" #print(a[0:1].shape)\n",
" #print(b[0:1].shape)\n",
" clf.fit(self.X, self.y) \n",
" #print(clf)\n",
" return(clf)\n",
" \n",
" def prediccion(self,test):\n",
" #print(test[0][\"texto\"])\n",
" self.terminos()\n",
" y=self.lista_pal_T\n",
" dfTest=pd.DataFrame(columns=y)\n",
" dfTest=dfTest.T\n",
" contador=0\n",
" \n",
" for elemento in test:\n",
" #print(elemento[\"texto\"])\n",
" dictiTest={}\n",
" #print(\"Entra texto \\n\")\n",
" completo=elemento[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" dictiTest.update({palabra:splito.count(palabra)})\n",
" print(len(dictiTest)) \n",
" print(len(y))\n",
" \n",
" for palabra in y:\n",
" if palabra in dictiTest.keys():\n",
" dictiTest.update({str(palabra):np.log(dictiTest[str(palabra)])+1}) \n",
" \n",
" elif palabra not in dictiTest.keys():\n",
" dictiTest.update({str(palabra):0})\n",
" \n",
" print(len(dictiTest))\n",
" #dfTest.insert(contador,\"Texto: \"+str(contador),dictiTest.values())\n",
" #contador=contador+1\n",
" #dfTest=dfTest.T\n",
" #dfTest.sort_index(axis=1, inplace=True)\n",
" #return(dfTest) \n",
" \n",
" x=self.ajuste()\n",
" s = pickle.dumps(x)\n",
" clf2 = pickle.loads(s)\n",
" #print(self.X[0:1])\n",
" #print(self.y[0:1])\n",
" #Cambiar[i+1:j+1]\n",
" #print(self.X.shape)\n",
" print(clf2.predict(self.X))\n",
" pass\n",
" \n",
"correo=DetectorSpam(dataH,dataS)\n",
"#correo.terminos()\n",
"#correo.lista_pal_H\n",
"#correo.tf()\n",
"#correo.listaS[1][\"spam\"]\n",
"#correo.y_label()\n",
"#correo.ajuste()\n",
"\n",
"correo.prediccion(dataT)"
]
},
{
"cell_type": "code",
"execution_count": 293,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'from sklearn.feature_extraction.text import CountVectorizer\\nlistax=[]\\nfor tx in dataS:\\n listax.append(tx[\"texto\"])\\n #x=x.split()\\nprint(listax[4])\\n#corpus = [\\n#\\'This is the first document.\\',\\n#\\'This document is the second document.\\',\\n#\\'And this is the third one.\\',\\n#\\'Is this the first document?\\',\\n#]\\ncorpus=listax\\nvectorizer = CountVectorizer()\\nX = vectorizer.fit_transform(corpus)\\nprint(vectorizer.get_feature_names())\\n#[\\'and\\', \\'document\\', \\'first\\', \\'is\\', \\'one\\', \\'second\\', \\'the\\', \\'third\\', \\'this\\']\\nprint(X.toarray()) '"
]
},
"execution_count": 293,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''from sklearn.feature_extraction.text import CountVectorizer\n",
"listax=[]\n",
"for tx in dataS:\n",
" listax.append(tx[\"texto\"])\n",
" #x=x.split()\n",
"print(listax[4])\n",
"#corpus = [\n",
"#'This is the first document.',\n",
"#'This document is the second document.',\n",
"#'And this is the third one.',\n",
"#'Is this the first document?',\n",
"#]\n",
"corpus=listax\n",
"vectorizer = CountVectorizer()\n",
"X = vectorizer.fit_transform(corpus)\n",
"print(vectorizer.get_feature_names())\n",
"#['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\n",
"print(X.toarray()) '''"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##  2. Validación SpamDetector\n",
"\n",
"Evalue la clase SpamDetector usando el esquema de validación cruzada con $F-measure$ como métrica de score y grafíque los valores de score para cada entrenamiento.\n",
"\n"
]
},
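{
"cell_type": "markdown",
"metadata": {},
"source": [
"Esbozo mínimo para este ejercicio, suponiendo que ya existen una matriz de frecuencias X y sus etiquetas (por ejemplo, las construidas con CountVectorizer en el esbozo del ejercicio 1). Usa cross_val_score con scoring='f1_macro' como F-measure y grafica el score de cada fold; el número de folds (cv=5) es solo un ejemplo."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo: validación cruzada con F-measure y gráfica de los scores por fold.\n",
"# Supone que X (frecuencias) y etiquetas ya fueron construidos previamente.\n",
"import matplotlib.pyplot as plt\n",
"from sklearn import svm\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"svc = svm.SVC(gamma='scale')\n",
"scores = cross_val_score(svc, X, etiquetas, cv=5, scoring='f1_macro')\n",
"print(scores)\n",
"\n",
"plt.plot(range(1, len(scores) + 1), scores, marker='o')\n",
"plt.xlabel('Fold')\n",
"plt.ylabel('F-measure (macro)')\n",
"plt.show()"
]
},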
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##  3. Persistencia SpamDetector\n",
"\n",
"Evalue la clase SpamDetector usando el esquema de validación cruzada con $F-measure$ como métrica de score y guarde, de manera persistente, en un archivo pickle el mejor estimador según los resultados de la evaluación."
]
},
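{
"cell_type": "markdown",
"metadata": {},
"source": [
"Esbozo mínimo para la persistencia: evalúa con validación cruzada usando F-measure, guarda en un archivo pickle el estimador del mejor fold y lo vuelve a cargar para predecir (la idea que retoma el ejercicio 4). Supone que X y etiquetas ya existen como en los esbozos anteriores; el nombre del archivo mejor_spam_detector.pkl es solo un ejemplo."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo: guardar de forma persistente el mejor estimador según F-measure.\n",
"# Supone que X y etiquetas ya existen; el nombre del archivo es ilustrativo.\n",
"import pickle\n",
"import numpy as np\n",
"from sklearn import svm\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.model_selection import KFold\n",
"\n",
"y_arr = np.asarray(etiquetas)\n",
"mejor_score, mejor_clf = -1.0, None\n",
"for train_idx, test_idx in KFold(n_splits=5).split(X):\n",
"    clf = svm.SVC(gamma='scale')\n",
"    clf.fit(X[train_idx], y_arr[train_idx])\n",
"    score = f1_score(y_arr[test_idx], clf.predict(X[test_idx]), average='macro')\n",
"    if score > mejor_score:\n",
"        mejor_score, mejor_clf = score, clf\n",
"\n",
"with open('mejor_spam_detector.pkl', 'wb') as f:\n",
"    pickle.dump(mejor_clf, f)\n",
"\n",
"# Reuso: cargar el modelo guardado y usarlo para predecir (aquí, sobre el primer documento como ejemplo)\n",
"with open('mejor_spam_detector.pkl', 'rb') as f:\n",
"    modelo = pickle.load(f)\n",
"print(mejor_score)\n",
"print(modelo.predict(X[:1]))"
]
},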
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##  4. Reuso SpamDetector\n",
"\n",
"Modifique la clase SpamDetector para que el constructor cargue un modelo previamente entrenado desde un archivo. Pruebe el código haciendo que se cargue el pickle del ejercicio anterior y realice la predicción de nuevos documentos no utilizados en el entrenamiento.\n"
]
}
],
"metadata": {
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 9. Procesamiento de Lenguaje Natural\n",
"\n",
"\n",
"## Contenido de la Presentación\n",
"\n",
"https://docs.google.com/presentation/d/1TYSbrhAfTvW8uuK1_nZiuoqgyOCkHxXrXCz9bVVP39Y/edit?usp=sharing\n",
"\n",
"\n",
"#### 9.1 Presentación de la Línea de investigación:\n",
" - Procesamiento de Lenguaje Natural\n",
" - Minería de Textos y Reconocimiento de patrones\n",
"\n",
"\n",
"#### 9.2 Presentación de dos estudios de caso:\n",
" - Recuperación, procesamiento y clasificación de tuits\n",
" - Reconocimiento de Entidades Nombradas Georeferenciables\n",
" \n",
"\n",
"#### 9.3 Instrumentos metodológicos:\n",
" - Datos y corpus lingüísticos como Instrumentos metodológicos de la Minería de Textos\n",
" - Técnicas de recolección de datos\n",
" - Repositorios\n",
"\t - Crawling\n",
" - Crowdsourcing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejemplos "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Repositorios\n",
"\n",
"### 20 Newsgroups\n",
"\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20newsgroups.data.html\n",
"\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crowdsourcing\n",
"\n",
"### Entidades Nombradas Georeferenciables\n",
"\n",
"http://ner.geoint.mx/\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejercicios"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Repositorios\n",
"\n",
"1. Generar una estructura de archivos y directorios similar a 20 Newsgroups pero con documentos en español y al menos cinco categorías\n",
"\n",
"2. Elejir y compilar al menos 100 documentos de cada categoría que serán utilizados en el proyecto final\n",
"\n",
"3. Subir el dataset generado al repositorio\n"
]
},
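{
"cell_type": "markdown",
"metadata": {},
"source": [
"Esbozo mínimo de la estructura de directorios pedida (una carpeta por categoría, como en 20 Newsgroups) y de su lectura con sklearn.datasets.load_files. Los nombres de la carpeta mi_dataset, de las categorías y del documento de ejemplo son supuestos ilustrativos; en el ejercicio real cada carpeta debe contener sus ~100 documentos."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo: estructura tipo 20 Newsgroups (una carpeta por categoría) y lectura con sklearn.\n",
"# Los nombres de carpetas y categorías son solo un ejemplo.\n",
"import os\n",
"from sklearn.datasets import load_files\n",
"\n",
"categorias = ['ciencia', 'cultura', 'deportes', 'economia', 'politica']\n",
"for cat in categorias:\n",
"    os.makedirs(os.path.join('mi_dataset', cat), exist_ok=True)\n",
"    # Aquí se copiarían los ~100 documentos .txt de cada categoría\n",
"    with open(os.path.join('mi_dataset', cat, 'doc_ejemplo.txt'), 'w') as f:\n",
"        f.write('Documento de ejemplo de la categoría ' + cat)\n",
"\n",
"dataset = load_files('mi_dataset', encoding='utf-8')\n",
"print(dataset.target_names)\n",
"print(len(dataset.data))"
]
},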
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crowdsourcing\n",
"\n",
"1. Ingrese a la plataforma de anotación de misoginia (Crowdsourcing)\n",
"\n",
"2. Haga el tutorial de la plataforma\n",
"\n",
"3. Realice 100 anotaciones de tuits siguiendo las instrucciones\n",
"\n",
"http://etiquetamisoginia.geoint.mx/\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crawling\n",
"\n",
"REMERI es la Red Mexicana de Repositorios Institucionales\n",
"\n",
"El objetivo general de REMERI es integrar una red federada de Repositorios de acceso abierto de las Instituciones Mexicanas de Educación Superior (IES), con la finalidad de integrar, difundir, preservar y dar visibilidad a la producción científica, académica y documental del país.\n",
"\n",
"1. Ingrese a la plataforma REMERI y realice una búsqueda por palabra, por ejemplo: \"nopal\"\n",
"\n",
"http://www.remeri.org.mx/portal/REMERI.jsp?busca=nopal\n",
"\n",
"\n",
"2. Defina la Clase CrawlerRemeri() la cual tendra un método search(query, n=5) que realiza la búsqueda de la cadena query en REMERI y descarga n documentos resultantes de la búsqueda.\n",
"\n",
"3. modifique el método método search(query, n=5) para que cuando n sea negativo, descargue todos, los documentos resultantes de la búsqueda en REMERI\n",
"\n",
"\n",
"\n",
"#### Observaciones\n",
"\n",
"* utilice la biblioteca de python Requests para realizar las peticiones\n",
"* Sea cuidadoso ya que el sitio podría banear su IP en caso de que detecte un ataque"
]
},
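{
"cell_type": "markdown",
"metadata": {},
"source": [
"Esbozo mínimo de la clase CrawlerRemeri con la biblioteca Requests. La URL base y el parámetro busca provienen del ejemplo de búsqueda del ejercicio, pero la estructura HTML de la página de resultados es un supuesto, así que la extracción de enlaces con una expresión regular es solo ilustrativa y habría que ajustarla al HTML real."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Esbozo de CrawlerRemeri con requests; la extracción de enlaces es ilustrativa.\n",
"import re\n",
"import requests\n",
"\n",
"class CrawlerRemeri():\n",
"    BASE = 'http://www.remeri.org.mx/portal/REMERI.jsp'\n",
"\n",
"    def search(self, query, n=5):\n",
"        resp = requests.get(self.BASE, params={'busca': query}, timeout=30)\n",
"        resp.raise_for_status()\n",
"        # Supuesto: los resultados aparecen como enlaces http(s) en el HTML\n",
"        enlaces = re.findall(r'href=\"(https?://[^\"]+)\"', resp.text)\n",
"        if n < 0:   # n negativo: descargar todos los resultados\n",
"            n = len(enlaces)\n",
"        documentos = []\n",
"        for url in enlaces[:n]:\n",
"            documentos.append(requests.get(url, timeout=30).text)\n",
"        return documentos\n",
"\n",
"# crawler = CrawlerRemeri()\n",
"# docs = crawler.search('nopal', n=5)"
]
},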
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Referencias \n",
"\n",
"Pérez C. et al. (2016). Recuperación, procesamiento y clasificación de tuits para visualizar estructuras de interacción. Research in Computing Science Journal, 124 (1), 23-37. http://www.rcs.cic.ipn.mx/2016_124/Recuperacion_%20procesamiento%20y%20clasificacion%20de%20tuits%20para%20visualizar%20estructuras%20de%20interaccion.pdf\n",
"\n",
"\n",
"T. Joachims (1996). A probabilistic analysis of the Rocchio algorithm with TFIDF for text categorization, Computer Science Technical Report CMU-CS-96-118. Carnegie Mellon University.\n",
"http://rexa.info/paper/7c077ad01b1a7f0605ca075ead0193d4555c2619\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
......@@ -560,7 +560,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
......@@ -591,14 +591,19 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CGarcia\n"
"Titulo de la nota\t04-06-19\n",
"CGarcia\n",
"Sección: Investigacion\n",
"La Jornada\n",
"\n",
"Este es el contenido de la nota\n"
]
}
],
......@@ -606,24 +611,24 @@
"miNota=Nota(\"Este es el contenido de la nota\",\"Investigacion\",\"CGarcia\",\"Titulo de la nota\")\n",
"Editorial=Fuente(\"La Jornada\", \"http://jornada.unam.mx\")\n",
"miNota.source=Editorial\n",
"print(miNota.author)"
"print(miNota)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Titulo de la nota\t02-05-19\n",
"CGarcia\n",
"Sección: Investigacion\n",
"La Jornada\n",
"\n",
"Este es el contenido de la nota\n"
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'nombre'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-ea7fff60b8dc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmiNota\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-2-e8982bf7334e>\u001b[0m in \u001b[0;36m__str__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauthor\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0;34m\"Sección: \"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msection\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnombre\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnombre\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'nombre'"
]
}
],
......@@ -633,7 +638,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
......
......@@ -560,9 +560,18 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 270,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'numpy.ndarray'>\n",
"[[4.7 3.2 1.3 0.2]]\n",
"(1,)\n"
]
},
{
"data": {
"text/plain": [
......@@ -572,19 +581,22 @@
" tol=0.001, verbose=False)"
]
},
"execution_count": 15,
"execution_count": 270,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import svm\n",
"from sklearn import svm, datasets\n",
"\n",
"\n",
"clf = svm.SVC(gamma='scale')\n",
"iris = datasets.load_iris()\n",
"X, y = iris.data, iris.target\n",
"\n",
"print(type(X))\n",
"print(X[2:3])\n",
"print(y[0:1].shape)\n",
"clf.fit(X, y) "
]
},
......@@ -834,16 +846,23 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 453,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1797\n"
]
},
{
"data": {
"text/plain": [
"0.98"
]
},
"execution_count": 1,
"execution_count": 453,
"metadata": {},
"output_type": "execute_result"
}
......@@ -857,18 +876,20 @@
"X_digits = digits.data\n",
"y_digits = digits.target\n",
"svc = svm.SVC(C=1, kernel='linear')\n",
"print(len(X_digits))\n",
"svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 180,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'numpy.ndarray'>\n",
"[0.9348914858096828, 0.9565943238731218, 0.9398998330550918]\n"
]
}
......@@ -943,6 +964,45 @@
"# Ejercicios"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Para asegurar la entrega de nuestros e-mail en su correo, por favor agregue mkt@superboletos.com a su libreta de direcciones de correo.Si no visualiza bien este mensaje, haga clic aquí\n",
"\n",
"\n",
"Hola Jóvenes. Les reenvío el correo de la Maestra Rosa Peralta para la actividad que tienen pendiente con ella. Aprovecho para agradecer el esfuerzo y el empaño para con estas sesiones de clase.\n",
"\n",
"\n",
" Buen día Sobrino y ahijado. En el archivo anexo encontraras un listado de las deducciones o gastos que son aceptados, para los que ganan por honorarios y que se utilizan en las declaraciones de pagos provisionales de cada mes. A continuación te enviare otro correo, con un desglose de las llamadas deducciones personales que aplican en las declaraciones anuales.\n"
]
}
],
"source": [
"import json\n",
"with open('data/Spam.json') as json_file: \n",
" dataS = json.load(json_file)\n",
" \n",
"print(dataS[0][\"texto\"])\n",
"\n",
"with open('data/Ham.json') as json1_file:\n",
" dataH=json.load(json1_file)\n",
"\n",
"print(\"\\n\")\n",
"print(dataH[0][\"texto\"])\n",
"\n",
"with open('data/Test.json') as json2_file:\n",
" dataT=json.load(json2_file)\n",
"\n",
"print(\"\\n\")\n",
"print(dataT[1][\"texto\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -955,6 +1015,289 @@
"Utilice como datos de ejemplo, una selección de su propio correo electrónico personal.\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:153: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"array(['H', 'H'], dtype='<U1')"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import svm, datasets\n",
"import pickle\n",
"\n",
"class DetectorSpam():\n",
" def __init__(self,listaH,listaS):\n",
" self.listaH=listaH\n",
" self.listaS=listaS\n",
" self.lista_pal_H=[]\n",
" self.lista_pal_S=[]\n",
" self.lista_pal_T=[]\n",
" #self.lista_pal_Test=[]\n",
" self.lista_label_y=[]\n",
" self.X=None\n",
" self.y=None\n",
" self.clf=self.ajuste()\n",
" \n",
" def y_label(self):\n",
" for clase in self.listaS:\n",
" self.lista_label_y.append(clase[\"spam\"])\n",
" \n",
" for clase in self.listaH:\n",
" self.lista_label_y.append(clase[\"spam\"])\n",
" \n",
" return(self.lista_label_y)\n",
" \n",
" \n",
" def terminos(self):\n",
" '''\n",
" Función que calcula obtiene todas las palabras existentes en los n textos de la lista de textos\n",
" \n",
" Parameters:\n",
" None\n",
" \n",
" Returns:\n",
" self.lista_pal(list): Lista con todos las palabras sin repeticiones de los n textos\n",
" '''\n",
" for textos in self.listaS:\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" self.lista_pal_S.append(palabra)\n",
" \n",
" self.lista_pal_S = list( dict.fromkeys(self.lista_pal_S))\n",
" \n",
" for textos in self.listaH:\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" self.lista_pal_H.append(palabra)\n",
" \n",
" self.lista_pal_H = list( dict.fromkeys(self.lista_pal_H))\n",
" self.lista_pal_T=self.lista_pal_S+self.lista_pal_H\n",
" \n",
" ###Ultima inserción importante\n",
" self.lista_pal_T = list( dict.fromkeys(self.lista_pal_T))\n",
" #print(len(self.lista_pal_H))\n",
" #print(len(self.lista_pal_S))\n",
" #print(len(self.lista_pal_T))\n",
" return(0)\n",
" \n",
" def tf(self):\n",
" '''\n",
" Función que calcula el term frequency para n textos\n",
" \n",
" Parameters:\n",
" None\n",
" \n",
" Returns:\n",
" df(pandas.DataFrame): DataFrame de Pandas que contiene en las filas los textos y en\n",
" las columnas cada palabra. Para cada celda se calcula:\n",
" log(#palabras)+1\n",
" '''\n",
" #self.buscar()\n",
" self.terminos()\n",
" x=self.lista_pal_S\n",
" y=self.lista_pal_H\n",
" #self.buscar()\n",
" dfS=pd.DataFrame(columns=x)\n",
" dfS=dfS.T\n",
" \n",
" dfH=pd.DataFrame(columns=y)\n",
" dfH=dfH.T\n",
" \n",
" contador=0\n",
" for textos in self.listaS:\n",
" dictiS={}\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" dictiS.update({palabra:splito.count(palabra)})\n",
" \n",
" #print(\"\\n\\n\")\n",
" #print(dicti.items()) \n",
" \n",
" for palabra in x:\n",
" if palabra in dictiS.keys():\n",
" dictiS.update({str(palabra):np.log(dictiS[str(palabra)])+1})\n",
" elif palabra not in dictiS.keys():\n",
" dictiS.update({str(palabra):0})\n",
" \n",
" dfS.insert(contador,\"Texto: \"+str(contador),dictiS.values())\n",
" contador=contador+1\n",
" \n",
" dfS=dfS.T\n",
" \n",
" contador=0\n",
" for textos in self.listaH:\n",
" dictiH={}\n",
" #print(\"Entra texto \\n\")\n",
" completo=textos[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" dictiH.update({palabra:splito.count(palabra)})\n",
" \n",
" #print(\"\\n\\n\")\n",
" #print(dictiH.items()) \n",
" \n",
" for palabra in y:\n",
" if palabra in dictiH.keys():\n",
" dictiH.update({str(palabra):np.log(dictiH[str(palabra)])+1})\n",
" elif palabra not in dictiH.keys():\n",
" dictiH.update({str(palabra):0})\n",
" \n",
" dfH.insert(contador,\"Texto: \"+str(5+contador),dictiH.values())\n",
" contador=contador+1 \n",
" dfH=dfH.T\n",
" dfT=pd.concat([dfS,dfH])\n",
" dfT=dfT.fillna(0)\n",
" dfT.sort_index(axis=1, inplace=True)\n",
" return(dfT)\n",
" \n",
" def ajuste(self):\n",
" x=self.tf()\n",
" y=self.y_label()\n",
" #print(x)\n",
" a = np.array(x)\n",
" b= np.array(y)\n",
" #print(a[0:1].shape)\n",
" self.clf = svm.SVC(gamma='scale',probability=True)\n",
" self.X, self.y = a, b\n",
" #print(type(X))\n",
" #print(a[0:1].shape)\n",
" #print(b[0:1].shape)\n",
" self.clf.fit(self.X, self.y) \n",
" #print(clf)\n",
" return(self.clf)\n",
" \n",
" def prediccion(self,test):\n",
" #print(test[0][\"texto\"])\n",
" self.terminos()\n",
" y=self.lista_pal_T\n",
" dfTest=pd.DataFrame(columns=y)\n",
" dfTest=dfTest.T\n",
" contador=0\n",
" \n",
" for elemento in test:\n",
" #print(elemento[\"texto\"])\n",
" dictiTest={}\n",
" #print(\"Entra texto \\n\")\n",
" completo=elemento[\"texto\"]\n",
" completo=re.sub(r\"[,| \\. | :|?|=|«|»| –|(|)|']\",\" \",completo)\n",
" completo=completo.lower()\n",
" #Sin puntuacion y en minusculas\n",
" splito=completo.split()\n",
" #print(splito)\n",
" \n",
" for palabra in splito:\n",
" dictiTest.update({palabra:splito.count(palabra)})\n",
" #print(len(dictiTest)) \n",
" #print(len(y))\n",
" \n",
" for palabra in y:\n",
" if palabra in dictiTest.keys():\n",
" dictiTest.update({str(palabra):np.log(dictiTest[str(palabra)])+1}) \n",
" \n",
" elif palabra not in dictiTest.keys():\n",
" dictiTest.update({str(palabra):0})\n",
" \n",
" #print(len(dictiTest))\n",
" xxx={}\n",
" for palabra in y:\n",
" for k in dictiTest.keys():\n",
" if palabra==k:\n",
" xxx.update({str(palabra):dictiTest[str(palabra)]})\n",
" \n",
" #print(len(xxx))\n",
" dfTest.insert(contador,\"Texto: \"+str(contador),xxx.values())\n",
" contador=contador+1\n",
" dfTest=dfTest.T\n",
" dfTest.sort_index(axis=1, inplace=True)\n",
" #return(dfTest)\n",
" \n",
" aaa = np.array(dfTest)\n",
" #print(aaa[0:1])\n",
" #x=self.ajuste()\n",
" s = pickle.dumps(self.clf)\n",
" #print(self.clf)\n",
" clf2 = pickle.loads(s)\n",
" #print(self.X[0:1])\n",
" #print(self.y[0:1])\n",
" #Cambiar[i+1:j+1]\n",
" #print(self.X.shape)\n",
" \n",
" #Cambiar para pronosticar otras cosas self.X para el train\n",
" #Probabilidades\n",
" \n",
" #print(clf2.predict_proba(self.X))\n",
" return(clf2.predict(aaa))\n",
"\n",
" def rank(self):\n",
" self.prediccion(dataT)\n",
" y=self.lista_label_y\n",
" x=self.X\n",
" svc2 = svm.SVC(gamma='scale')\n",
" return(svc2.fit(x[:-1], y[:-1]).score(x[-1:], y[-1:]))\n",
" \n",
" \n",
"correo=DetectorSpam(dataH,dataS)\n",
"#correo.terminos()\n",
"#correo.lista_pal_H\n",
"#correo.tf()\n",
"#correo.listaS[1][\"spam\"]\n",
"#correo.y_label()\n",
"correo.prediccion(dataT)\n",
"#correo.rank()"
]
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -965,6 +1308,13 @@
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
......
......@@ -4,8 +4,124 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Herramientas\n",
"## 3.1 Django (Instalacion, apps y vistas)"
"# 9. Procesamiento de Lenguaje Natural\n",
"\n",
"\n",
"## Contenido de la Presentación\n",
"\n",
"https://docs.google.com/presentation/d/1TYSbrhAfTvW8uuK1_nZiuoqgyOCkHxXrXCz9bVVP39Y/edit?usp=sharing\n",
"\n",
"\n",
"#### 9.1 Presentación de la Línea de investigación:\n",
" - Procesamiento de Lenguaje Natural\n",
" - Minería de Textos y Reconocimiento de patrones\n",
"\n",
"\n",
"#### 9.2 Presentación de dos estudios de caso:\n",
" - Recuperación, procesamiento y clasificación de tuits\n",
" - Reconocimiento de Entidades Nombradas Georeferenciables\n",
" \n",
"\n",
"#### 9.3 Instrumentos metodológicos:\n",
" - Datos y corpus lingüísticos como Instrumentos metodológicos de la Minería de Textos\n",
" - Técnicas de recolección de datos\n",
" - Repositorios\n",
"\t - Crawling\n",
" - Crowdsourcing\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejemplos "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Repositorios\n",
"\n",
"### 20 Newsgroups\n",
"\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20newsgroups.data.html\n",
"\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crowdsourcing\n",
"\n",
"### Entidades Nombradas Georeferenciables\n",
"\n",
"http://ner.geoint.mx/\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejercicios"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Repositorios\n",
"\n",
"1. Generar una estructura de archivos y directorios similar a 20 Newsgroups pero con documentos en español y al menos cinco categorías\n",
"\n",
"2. Elejir y compilar al menos 100 documentos de cada categoría que serán utilizados en el proyecto final\n",
"\n",
"3. Subir el dataset generado al repositorio\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crowdsourcing\n",
"\n",
"1. Log in to the misogyny annotation platform (Crowdsourcing)\n",
"\n",
"2. Complete the platform tutorial\n",
"\n",
"3. Annotate 100 tweets following the instructions\n",
"\n",
"http://etiquetamisoginia.geoint.mx/\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crawling\n",
"\n",
"REMERI is the Red Mexicana de Repositorios Institucionales (Mexican Network of Institutional Repositories).\n",
"\n",
"REMERI's general goal is to integrate a federated network of open-access repositories of Mexican Higher Education Institutions (IES), in order to bring together, disseminate, preserve and give visibility to the country's scientific, academic and documentary output.\n",
"\n",
"1. Go to the REMERI platform and run a keyword search, for example: \"nopal\"\n",
"\n",
"http://www.remeri.org.mx/portal/REMERI.jsp?busca=nopal\n",
"\n",
"\n",
"2. Define a CrawlerRemeri() class with a search(query, n=5) method that searches REMERI for the string query and downloads n documents from the results (a minimal sketch is given after this cell).\n",
"\n",
"3. Modify the search(query, n=5) method so that, when n is negative, it downloads all the documents returned by the REMERI search.\n",
"\n",
"\n",
"\n",
"#### Notes\n",
"\n",
"* Use the Python Requests library to make the requests\n",
"* Be careful: the site could ban your IP if it detects attack-like traffic"
]
},
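{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, hedged sketch for exercise 2 above. It only assumes the public search URL pattern shown (REMERI.jsp?busca=...) and that the results page contains direct links to PDF files; the link-extraction regex, the out_dir parameter and the file-naming scheme are illustrative assumptions, not REMERI's documented interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import requests\n",
"\n",
"\n",
"class CrawlerRemeri:\n",
"    '''Sketch of a small crawler for the REMERI search page.'''\n",
"\n",
"    BASE_URL = 'http://www.remeri.org.mx/portal/REMERI.jsp'\n",
"\n",
"    def search(self, query, n=5, out_dir='remeri_docs'):\n",
"        # Request the results page (URL pattern taken from the exercise above)\n",
"        resp = requests.get(self.BASE_URL, params={'busca': query}, timeout=30)\n",
"        resp.raise_for_status()\n",
"        # ASSUMPTION: result documents appear in the HTML as absolute .pdf URLs\n",
"        links = re.findall(r'https?://\\S+?\\.pdf', resp.text)\n",
"        # A negative n means: download every document found\n",
"        if n >= 0:\n",
"            links = links[:n]\n",
"        os.makedirs(out_dir, exist_ok=True)\n",
"        saved = []\n",
"        for i, url in enumerate(links):\n",
"            doc = requests.get(url, timeout=60)\n",
"            path = os.path.join(out_dir, 'doc_%03d.pdf' % i)\n",
"            with open(path, 'wb') as fh:\n",
"                fh.write(doc.content)\n",
"            saved.append(path)\n",
"        return saved\n",
"\n",
"\n",
"# Usage (commented out to avoid hammering the site unintentionally):\n",
"# crawler = CrawlerRemeri()\n",
"# crawler.search('nopal', n=3)"
]
},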
{
......@@ -19,7 +135,1193 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3.2 Django (Templates y HTML5)"
"# 9. Natural Language Processing, part 2\n",
"\n",
"\n",
"## Presentation contents\n",
"\n",
"\n",
"https://docs.google.com/presentation/d/1WCVA9bMu12rfQDSg5guPXg6FkLgKmRsUM9aptMJ4Z1s/edit?usp=sharing\n",
"\n",
"\n",
" \n",
"#### 9.4 Text encoding:\n",
"\n",
" - Bag of words\n",
" - Vector space model\n",
"\n",
"\n",
"#### 9.5 Text similarity with the cosine product\n",
"\n",
"\n",
"#### 9.6 Machine-learning text analysis techniques in case studies:\n",
"\n",
"\n",
" - Classification\n",
" - Clustering"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vector space model in sklearn"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer = CountVectorizer()\n",
"vectorizer "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 1)\t1\n",
" (0, 6)\t1\n",
" (0, 2)\t1\n",
" (0, 4)\t1\n",
" (0, 5)\t1\n",
" (1, 7)\t1\n",
" (1, 1)\t1\n",
" (1, 2)\t1\n",
" (1, 4)\t1\n",
" (1, 5)\t1\n",
" (2, 8)\t1\n",
" (2, 2)\t1\n",
" (3, 3)\t1\n",
" (3, 0)\t1\n",
" (3, 6)\t1\n",
" (3, 2)\t1\n",
" (3, 4)\t1\n",
" (3, 5)\t1\n"
]
}
],
"source": [
"# a minimal vector space (bag-of-words) example\n",
"\n",
"corpus = [\n",
" 'Este es el primer documento.',\n",
" 'Este es el segundo documento.',\n",
" 'Y el tercero.',\n",
" 'Acaso este es el primer elemento?',\n",
"]\n",
"X = vectorizer.fit_transform(corpus)\n",
"print(X) \n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['este', 'es', 'un', 'documento', 'analizar']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"analyze = vectorizer.build_analyzer()\n",
"analyze(\"Este es un documento a analizar.\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['acaso',\n",
" 'documento',\n",
" 'el',\n",
" 'elemento',\n",
" 'es',\n",
" 'este',\n",
" 'primer',\n",
" 'segundo',\n",
" 'tercero']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.vocabulary_.get('documento')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['los',\n",
" 'ngramas',\n",
" 'son',\n",
" 'muy',\n",
" 'chidos',\n",
" 'los ngramas',\n",
" 'ngramas son',\n",
" 'son muy',\n",
" 'muy chidos',\n",
" 'los ngramas son',\n",
" 'ngramas son muy',\n",
" 'son muy chidos']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# n-grams from 1 to 3 tokens (unigrams, bigrams and trigrams)\n",
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1)\n",
"analyze = bigram_vectorizer.build_analyzer()\n",
"analyze('Los ngramas son muy chidos')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tf-idf vector space model"
]
},
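{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick reference (following the scikit-learn documentation): each raw count is weighted as $\\mathrm{tfidf}(t,d)=\\mathrm{tf}(t,d)\\cdot \\mathrm{idf}(t)$. With smooth_idf=False, as in the transformer created below, $\\mathrm{idf}(t)=\\ln\\frac{n}{\\mathrm{df}(t)}+1$, where $n$ is the number of documents and $\\mathrm{df}(t)$ is the number of documents containing term $t$; each resulting document vector is then L2-normalized (norm='l2')."
]
},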
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,\n",
" use_idf=True)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"transformer = TfidfTransformer(smooth_idf=False)\n",
"transformer \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Example using word counts: a term that appears in 100% of the documents (here 'el', which occurs in every sentence of the corpus above) gets the lowest idf weight and is therefore considered not very informative."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0.53409337, 0.31544415, 0. , 0.40619178,\n",
" 0.40619178, 0.53409337, 0. , 0. ],\n",
" [0. , 0.47182467, 0.27866725, 0. , 0.35883482,\n",
" 0.35883482, 0. , 0.66498209, 0. ],\n",
" [0. , 0. , 0.38649524, 0. , 0. ,\n",
" 0. , 0. , 0. , 0.9222914 ],\n",
" [0.55372869, 0. , 0.23204543, 0.55372869, 0.29880073,\n",
" 0.29880073, 0.39288706, 0. , 0. ]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Example count matrix (defined here for illustration, not used below)\n",
"counts = [[3, 0, 1],\n",
"          [2, 0, 0],\n",
"          [3, 0, 0],\n",
"          [4, 0, 0],\n",
"          [3, 2, 0],\n",
"          [3, 0, 2]]\n",
"\n",
"# Apply the tf-idf weighting to the document-term matrix X built above\n",
"tfidf = transformer.fit_transform(X)\n",
"tfidf.toarray()\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 5)\t0.4181266243877562\n",
" (0, 4)\t0.4181266243877562\n",
" (0, 2)\t0.3418459132932508\n",
" (0, 6)\t0.5164695651831305\n",
" (0, 1)\t0.5164695651831305\n",
" (1, 5)\t0.3878225151467608\n",
" (1, 4)\t0.3878225151467608\n",
" (1, 2)\t0.3170703183040649\n",
" (1, 1)\t0.4790379614294201\n",
" (1, 7)\t0.6075989123184679\n",
" (2, 2)\t0.46263733109032296\n",
" (2, 8)\t0.8865476297873808\n",
" (3, 5)\t0.3314387711719163\n",
" (3, 4)\t0.3314387711719163\n",
" (3, 2)\t0.2709729130450805\n",
" (3, 6)\t0.4093928203750212\n",
" (3, 0)\t0.519262881857229\n",
" (3, 3)\t0.519262881857229\n"
]
}
],
"source": [
"# Since tf-idf is such a common document representation, sklearn provides the\n",
"# TfidfVectorizer class, which combines CountVectorizer and TfidfTransformer\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer()\n",
"z=vectorizer.fit_transform(corpus)\n",
"\n",
"print(z)"
]
},
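{
"cell_type": "markdown",
"metadata": {},
"source": [
"Section 9.5 mentions text similarity with the cosine product, which is not demonstrated above. The following self-contained sketch reuses the same four example sentences purely as illustrative data and computes all pairwise cosine similarities between their tf-idf vectors with sklearn.metrics.pairwise.cosine_similarity; since TfidfVectorizer L2-normalizes the vectors, this is simply their dot product."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"docs = [\n",
"    'Este es el primer documento.',\n",
"    'Este es el segundo documento.',\n",
"    'Y el tercero.',\n",
"    'Acaso este es el primer elemento?',\n",
"]\n",
"\n",
"# Entry (i, j) is the cosine similarity between documents i and j (1.0 on the diagonal)\n",
"tfidf_docs = TfidfVectorizer().fit_transform(docs)\n",
"cosine_similarity(tfidf_docs)"
]
},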
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Complete example: classification of the 20 Newsgroups dataset"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 20news dataset. This may take a few minutes.\n",
"2019-04-01 20:56:53,543 INFO Downloading 20news dataset. This may take a few minutes.\n",
"Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n",
"2019-04-01 20:56:53,548 INFO Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n",
"Usage: ipykernel_launcher.py [options]\n",
"\n",
"Options:\n",
" -h, --help show this help message and exit\n",
" --report Print a detailed classification report.\n",
" --chi2_select=SELECT_CHI2\n",
" Select some number of features using a chi-squared\n",
" test\n",
" --confusion_matrix Print the confusion matrix.\n",
" --top10 Print ten most discriminative terms per class for\n",
" every classifier.\n",
" --all_categories Whether to use all categories or not.\n",
" --use_hashing Use a hashing vectorizer.\n",
" --n_features=N_FEATURES\n",
" n_features when using the hashing vectorizer.\n",
" --filtered Remove newsgroup information that is easily overfit:\n",
" headers, signatures, and quoting.\n",
"\n",
"Loading 20 newsgroups dataset for categories:\n",
"['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"data loaded\n",
"2034 documents - 3.980MB (training set)\n",
"1353 documents - 2.867MB (test set)\n",
"4 categories\n",
"\n",
"Extracting features from the training data using a sparse vectorizer\n",
"done in 0.737494s at 5.396MB/s\n",
"n_samples: 2034, n_features: 33809\n",
"\n",
"Extracting features from the test data using the same vectorizer\n",
"done in 0.422445s at 6.788MB/s\n",
"n_samples: 1353, n_features: 33809\n",
"\n",
"================================================================================\n",
"Ridge Classifier\n",
"________________________________________________________________________________\n",
"Training: \n",
"RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,\n",
" max_iter=None, normalize=False, random_state=None, solver='sag',\n",
" tol=0.01)\n",
"train time: 0.235s\n",
"test time: 0.006s\n",
"accuracy: 0.896\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"================================================================================\n",
"Perceptron\n",
"________________________________________________________________________________\n",
"Training: \n",
"Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,\n",
" fit_intercept=True, max_iter=50, n_iter=None, n_iter_no_change=5,\n",
" n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.001,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.027s\n",
"test time: 0.003s\n",
"accuracy: 0.888\n",
"dimensionality: 33809\n",
"density: 0.240114\n",
"\n",
"\n",
"================================================================================\n",
"Passive-Aggressive\n",
"________________________________________________________________________________\n",
"Training: \n",
"PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,\n",
" early_stopping=False, fit_intercept=True, loss='hinge',\n",
" max_iter=50, n_iter=None, n_iter_no_change=5, n_jobs=None,\n",
" random_state=None, shuffle=True, tol=0.001,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.031s\n",
"test time: 0.003s\n",
"accuracy: 0.905\n",
"dimensionality: 33809\n",
"density: 0.716584\n",
"\n",
"\n",
"================================================================================\n",
"kNN\n",
"________________________________________________________________________________\n",
"Training: \n",
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=None, n_neighbors=10, p=2,\n",
" weights='uniform')\n",
"train time: 0.005s\n",
"test time: 0.260s\n",
"accuracy: 0.858\n",
"\n",
"================================================================================\n",
"Random forest\n",
"________________________________________________________________________________\n",
"Training: \n",
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)\n",
"train time: 1.827s\n",
"test time: 0.154s\n",
"accuracy: 0.827\n",
"\n",
"================================================================================\n",
"L2 penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.001,\n",
" verbose=0)\n",
"train time: 0.209s\n",
"test time: 0.002s\n",
"accuracy: 0.900\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.171s\n",
"test time: 0.002s\n",
"accuracy: 0.903\n",
"dimensionality: 33809\n",
"density: 0.664172\n",
"\n",
"\n",
"================================================================================\n",
"L1 penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l1', random_state=None, tol=0.001,\n",
" verbose=0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py:183: FutureWarning: max_iter and tol parameters have been added in SGDClassifier in 0.19. If max_iter is set but tol is left unset, the default value for tol in 0.19 and 0.20 will be None (which is equivalent to -infinity, so it has no effect) but will change in 0.21 to 1e-3. Specify tol to silence this warning.\n",
" FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"train time: 0.374s\n",
"test time: 0.004s\n",
"accuracy: 0.873\n",
"dimensionality: 33809\n",
"density: 0.005561\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.487s\n",
"test time: 0.002s\n",
"accuracy: 0.882\n",
"dimensionality: 33809\n",
"density: 0.020387\n",
"\n",
"\n",
"================================================================================\n",
"Elastic-Net penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.625s\n",
"test time: 0.004s\n",
"accuracy: 0.899\n",
"dimensionality: 33809\n",
"density: 0.188648\n",
"\n",
"\n",
"================================================================================\n",
"NearestCentroid (aka Rocchio classifier)\n",
"________________________________________________________________________________\n",
"Training: \n",
"NearestCentroid(metric='euclidean', shrink_threshold=None)\n",
"train time: 0.020s\n",
"test time: 0.005s\n",
"accuracy: 0.855\n",
"\n",
"================================================================================\n",
"Naive Bayes\n",
"________________________________________________________________________________\n",
"Training: \n",
"MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)\n",
"train time: 0.011s\n",
"test time: 0.002s\n",
"accuracy: 0.899\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)\n",
"train time: 0.014s\n",
"test time: 0.012s\n",
"accuracy: 0.884\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"ComplementNB(alpha=0.1, class_prior=None, fit_prior=True, norm=False)\n",
"train time: 0.012s\n",
"test time: 0.002s\n",
"accuracy: 0.911\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"================================================================================\n",
"LinearSVC with L1-based feature selection\n",
"________________________________________________________________________________\n",
"Training: \n",
"Pipeline(memory=None,\n",
" steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l1', random_state=None, tol=0.001,\n",
" verbose=0),\n",
" max_features=None, no...ax_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
" verbose=0))])\n",
"train time: 0.340s\n",
"test time: 0.005s\n",
"accuracy: 0.880\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
"# Olivier Grisel <olivier.grisel@ensta.org>\n",
"# Mathieu Blondel <mathieu@mblondel.org>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"\n",
"from __future__ import print_function\n",
"\n",
"import logging\n",
"import numpy as np\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"from sklearn.linear_model import RidgeClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.linear_model import Perceptron\n",
"from sklearn.linear_model import PassiveAggressiveClassifier\n",
"from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.neighbors import NearestCentroid\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.utils.extmath import density\n",
"from sklearn import metrics\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--report\",\n",
" action=\"store_true\", dest=\"print_report\",\n",
" help=\"Print a detailed classification report.\")\n",
"op.add_option(\"--chi2_select\",\n",
" action=\"store\", type=\"int\", dest=\"select_chi2\",\n",
" help=\"Select some number of features using a chi-squared test\")\n",
"op.add_option(\"--confusion_matrix\",\n",
" action=\"store_true\", dest=\"print_cm\",\n",
" help=\"Print the confusion matrix.\")\n",
"op.add_option(\"--top10\",\n",
" action=\"store_true\", dest=\"print_top10\",\n",
" help=\"Print ten most discriminative terms per class\"\n",
" \" for every classifier.\")\n",
"op.add_option(\"--all_categories\",\n",
" action=\"store_true\", dest=\"all_categories\",\n",
" help=\"Whether to use all categories or not.\")\n",
"op.add_option(\"--use_hashing\",\n",
" action=\"store_true\",\n",
" help=\"Use a hashing vectorizer.\")\n",
"op.add_option(\"--n_features\",\n",
" action=\"store\", type=int, default=2 ** 16,\n",
" help=\"n_features when using the hashing vectorizer.\")\n",
"op.add_option(\"--filtered\",\n",
" action=\"store_true\",\n",
" help=\"Remove newsgroup information that is easily overfit: \"\n",
" \"headers, signatures, and quoting.\")\n",
"\n",
"\n",
"def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1)\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"if opts.all_categories:\n",
" categories = None\n",
"else:\n",
" categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"\n",
"if opts.filtered:\n",
" remove = ('headers', 'footers', 'quotes')\n",
"else:\n",
" remove = ()\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories if categories else \"all\")\n",
"\n",
"data_train = fetch_20newsgroups(subset='train', categories=categories,\n",
" shuffle=True, random_state=42,\n",
" remove=remove)\n",
"\n",
"data_test = fetch_20newsgroups(subset='test', categories=categories,\n",
" shuffle=True, random_state=42,\n",
" remove=remove)\n",
"print('data loaded')\n",
"\n",
"# order of labels in `target_names` can be different from `categories`\n",
"target_names = data_train.target_names\n",
"\n",
"\n",
"def size_mb(docs):\n",
" return sum(len(s.encode('utf-8')) for s in docs) / 1e6\n",
"\n",
"\n",
"data_train_size_mb = size_mb(data_train.data)\n",
"data_test_size_mb = size_mb(data_test.data)\n",
"\n",
"print(\"%d documents - %0.3fMB (training set)\" % (\n",
" len(data_train.data), data_train_size_mb))\n",
"print(\"%d documents - %0.3fMB (test set)\" % (\n",
" len(data_test.data), data_test_size_mb))\n",
"print(\"%d categories\" % len(target_names))\n",
"print()\n",
"\n",
"# split a training set and a test set\n",
"y_train, y_test = data_train.target, data_test.target\n",
"\n",
"print(\"Extracting features from the training data using a sparse vectorizer\")\n",
"t0 = time()\n",
"if opts.use_hashing:\n",
" vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,\n",
" n_features=opts.n_features)\n",
" X_train = vectorizer.transform(data_train.data)\n",
"else:\n",
" vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,\n",
" stop_words='english')\n",
" X_train = vectorizer.fit_transform(data_train.data)\n",
"duration = time() - t0\n",
"print(\"done in %fs at %0.3fMB/s\" % (duration, data_train_size_mb / duration))\n",
"print(\"n_samples: %d, n_features: %d\" % X_train.shape)\n",
"print()\n",
"\n",
"print(\"Extracting features from the test data using the same vectorizer\")\n",
"t0 = time()\n",
"X_test = vectorizer.transform(data_test.data)\n",
"duration = time() - t0\n",
"print(\"done in %fs at %0.3fMB/s\" % (duration, data_test_size_mb / duration))\n",
"print(\"n_samples: %d, n_features: %d\" % X_test.shape)\n",
"print()\n",
"\n",
"# mapping from integer feature name to original token string\n",
"if opts.use_hashing:\n",
" feature_names = None\n",
"else:\n",
" feature_names = vectorizer.get_feature_names()\n",
"\n",
"if opts.select_chi2:\n",
" print(\"Extracting %d best features by a chi-squared test\" %\n",
" opts.select_chi2)\n",
" t0 = time()\n",
" ch2 = SelectKBest(chi2, k=opts.select_chi2)\n",
" X_train = ch2.fit_transform(X_train, y_train)\n",
" X_test = ch2.transform(X_test)\n",
" if feature_names:\n",
" # keep selected feature names\n",
" feature_names = [feature_names[i] for i\n",
" in ch2.get_support(indices=True)]\n",
" print(\"done in %fs\" % (time() - t0))\n",
" print()\n",
"\n",
"if feature_names:\n",
" feature_names = np.asarray(feature_names)\n",
"\n",
"\n",
"def trim(s):\n",
" \"\"\"Trim string to fit on terminal (assuming 80-column display)\"\"\"\n",
" return s if len(s) <= 80 else s[:77] + \"...\"\n",
"\n",
"\n",
"# #############################################################################\n",
"# Benchmark classifiers\n",
"def benchmark(clf):\n",
" print('_' * 80)\n",
" print(\"Training: \")\n",
" print(clf)\n",
" t0 = time()\n",
" clf.fit(X_train, y_train)\n",
" train_time = time() - t0\n",
" print(\"train time: %0.3fs\" % train_time)\n",
"\n",
" t0 = time()\n",
" pred = clf.predict(X_test)\n",
" test_time = time() - t0\n",
" print(\"test time: %0.3fs\" % test_time)\n",
"\n",
" score = metrics.accuracy_score(y_test, pred)\n",
" print(\"accuracy: %0.3f\" % score)\n",
"\n",
" if hasattr(clf, 'coef_'):\n",
" print(\"dimensionality: %d\" % clf.coef_.shape[1])\n",
" print(\"density: %f\" % density(clf.coef_))\n",
"\n",
" if opts.print_top10 and feature_names is not None:\n",
" print(\"top 10 keywords per class:\")\n",
" for i, label in enumerate(target_names):\n",
" top10 = np.argsort(clf.coef_[i])[-10:]\n",
" print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n",
" print()\n",
"\n",
" if opts.print_report:\n",
" print(\"classification report:\")\n",
" print(metrics.classification_report(y_test, pred,\n",
" target_names=target_names))\n",
"\n",
" if opts.print_cm:\n",
" print(\"confusion matrix:\")\n",
" print(metrics.confusion_matrix(y_test, pred))\n",
"\n",
" print()\n",
" clf_descr = str(clf).split('(')[0]\n",
" return clf_descr, score, train_time, test_time\n",
"\n",
"\n",
"results = []\n",
"for clf, name in (\n",
" (RidgeClassifier(tol=1e-2, solver=\"sag\"), \"Ridge Classifier\"),\n",
" (Perceptron(max_iter=50, tol=1e-3), \"Perceptron\"),\n",
" (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),\n",
" \"Passive-Aggressive\"),\n",
" (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n",
" (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n",
" print('=' * 80)\n",
" print(name)\n",
" results.append(benchmark(clf))\n",
"\n",
"for penalty in [\"l2\", \"l1\"]:\n",
" print('=' * 80)\n",
" print(\"%s penalty\" % penalty.upper())\n",
" # Train Liblinear model\n",
" results.append(benchmark(LinearSVC(penalty=penalty, dual=False,\n",
" tol=1e-3)))\n",
"\n",
" # Train SGD model\n",
" results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,\n",
" penalty=penalty)))\n",
"\n",
"# Train SGD with Elastic Net penalty\n",
"print('=' * 80)\n",
"print(\"Elastic-Net penalty\")\n",
"results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,\n",
" penalty=\"elasticnet\")))\n",
"\n",
"# Train NearestCentroid without threshold\n",
"print('=' * 80)\n",
"print(\"NearestCentroid (aka Rocchio classifier)\")\n",
"results.append(benchmark(NearestCentroid()))\n",
"\n",
"# Train sparse Naive Bayes classifiers\n",
"print('=' * 80)\n",
"print(\"Naive Bayes\")\n",
"results.append(benchmark(MultinomialNB(alpha=.01)))\n",
"results.append(benchmark(BernoulliNB(alpha=.01)))\n",
"results.append(benchmark(ComplementNB(alpha=.1)))\n",
"\n",
"print('=' * 80)\n",
"print(\"LinearSVC with L1-based feature selection\")\n",
"# The smaller C, the stronger the regularization.\n",
"# The more regularization, the more sparsity.\n",
"results.append(benchmark(Pipeline([\n",
" ('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\", dual=False,\n",
" tol=1e-3))),\n",
" ('classification', LinearSVC(penalty=\"l2\"))])))\n",
"\n",
"# make some plots\n",
"\n",
"indices = np.arange(len(results))\n",
"\n",
"results = [[x[i] for x in results] for i in range(4)]\n",
"\n",
"clf_names, score, training_time, test_time = results\n",
"training_time = np.array(training_time) / np.max(training_time)\n",
"test_time = np.array(test_time) / np.max(test_time)\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"plt.title(\"Score\")\n",
"plt.barh(indices, score, .2, label=\"score\", color='navy')\n",
"plt.barh(indices + .3, training_time, .2, label=\"training time\",\n",
" color='c')\n",
"plt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\n",
"plt.yticks(())\n",
"plt.legend(loc='best')\n",
"plt.subplots_adjust(left=.25)\n",
"plt.subplots_adjust(top=.95)\n",
"plt.subplots_adjust(bottom=.05)\n",
"\n",
"for i, c in zip(indices, clf_names):\n",
" plt.text(-.3, i, c)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Complete example: clustering"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n",
"Usage: ipykernel_launcher.py [options]\n",
"\n",
"Options:\n",
" -h, --help show this help message and exit\n",
" --lsa=N_COMPONENTS Preprocess documents with latent semantic analysis.\n",
" --no-minibatch Use ordinary k-means algorithm (in batch mode).\n",
" --no-idf Disable Inverse Document Frequency feature weighting.\n",
" --use-hashing Use a hashing feature vectorizer\n",
" --n-features=N_FEATURES\n",
" Maximum number of features (dimensions) to extract\n",
" from text.\n",
" --verbose Print progress reports inside k-means algorithm.\n",
"Loading 20 newsgroups dataset for categories:\n",
"['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"3387 documents\n",
"4 categories\n",
"\n",
"Extracting features from the training dataset using a sparse vectorizer\n",
"done in 1.281258s\n",
"n_samples: 3387, n_features: 10000\n",
"\n",
"Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',\n",
" init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,\n",
" n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,\n",
" verbose=False)\n",
"done in 0.094s\n",
"\n",
"Homogeneity: 0.596\n",
"Completeness: 0.651\n",
"V-measure: 0.623\n",
"Adjusted Rand-Index: 0.569\n",
"Silhouette Coefficient: 0.008\n",
"\n",
"Top terms per cluster:\n",
"Cluster 0: graphics image university thanks com files file 3d ac posting\n",
"Cluster 1: access digex henry pat toronto net com hst prb zoo\n",
"Cluster 2: space nasa gov alaska moon launch com shuttle just like\n",
"Cluster 3: god com people sandvik article don jesus say keith christian\n"
]
}
],
"source": [
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"\n",
"from __future__ import print_function\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import Normalizer\n",
"from sklearn import metrics\n",
"\n",
"from sklearn.cluster import KMeans, MiniBatchKMeans\n",
"\n",
"import logging\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"\n",
"import numpy as np\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--lsa\",\n",
" dest=\"n_components\", type=\"int\",\n",
" help=\"Preprocess documents with latent semantic analysis.\")\n",
"op.add_option(\"--no-minibatch\",\n",
" action=\"store_false\", dest=\"minibatch\", default=True,\n",
" help=\"Use ordinary k-means algorithm (in batch mode).\")\n",
"op.add_option(\"--no-idf\",\n",
" action=\"store_false\", dest=\"use_idf\", default=True,\n",
" help=\"Disable Inverse Document Frequency feature weighting.\")\n",
"op.add_option(\"--use-hashing\",\n",
" action=\"store_true\", default=False,\n",
" help=\"Use a hashing feature vectorizer\")\n",
"op.add_option(\"--n-features\", type=int, default=10000,\n",
" help=\"Maximum number of features (dimensions)\"\n",
" \" to extract from text.\")\n",
"op.add_option(\"--verbose\",\n",
" action=\"store_true\", dest=\"verbose\", default=False,\n",
" help=\"Print progress reports inside k-means algorithm.\")\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"\n",
"\n",
"def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1)\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
"]\n",
"# Uncomment the following to do the analysis on all the categories\n",
"# categories = None\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories)\n",
"\n",
"dataset = fetch_20newsgroups(subset='all', categories=categories,\n",
" shuffle=True, random_state=42)\n",
"\n",
"print(\"%d documents\" % len(dataset.data))\n",
"print(\"%d categories\" % len(dataset.target_names))\n",
"print()\n",
"\n",
"labels = dataset.target\n",
"true_k = np.unique(labels).shape[0]\n",
"\n",
"print(\"Extracting features from the training dataset \"\n",
" \"using a sparse vectorizer\")\n",
"t0 = time()\n",
"if opts.use_hashing:\n",
" if opts.use_idf:\n",
" # Perform an IDF normalization on the output of HashingVectorizer\n",
" hasher = HashingVectorizer(n_features=opts.n_features,\n",
" stop_words='english', alternate_sign=False,\n",
" norm=None, binary=False)\n",
" vectorizer = make_pipeline(hasher, TfidfTransformer())\n",
" else:\n",
" vectorizer = HashingVectorizer(n_features=opts.n_features,\n",
" stop_words='english',\n",
" alternate_sign=False, norm='l2',\n",
" binary=False)\n",
"else:\n",
" vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n",
" min_df=2, stop_words='english',\n",
" use_idf=opts.use_idf)\n",
"X = vectorizer.fit_transform(dataset.data)\n",
"\n",
"print(\"done in %fs\" % (time() - t0))\n",
"print(\"n_samples: %d, n_features: %d\" % X.shape)\n",
"print()\n",
"\n",
"if opts.n_components:\n",
" print(\"Performing dimensionality reduction using LSA\")\n",
" t0 = time()\n",
" # Vectorizer results are normalized, which makes KMeans behave as\n",
" # spherical k-means for better results. Since LSA/SVD results are\n",
" # not normalized, we have to redo the normalization.\n",
" svd = TruncatedSVD(opts.n_components)\n",
" normalizer = Normalizer(copy=False)\n",
" lsa = make_pipeline(svd, normalizer)\n",
"\n",
" X = lsa.fit_transform(X)\n",
"\n",
" print(\"done in %fs\" % (time() - t0))\n",
"\n",
" explained_variance = svd.explained_variance_ratio_.sum()\n",
" print(\"Explained variance of the SVD step: {}%\".format(\n",
" int(explained_variance * 100)))\n",
"\n",
" print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Do the actual clustering\n",
"\n",
"if opts.minibatch:\n",
" km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n",
" init_size=1000, batch_size=1000, verbose=opts.verbose)\n",
"else:\n",
" km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n",
" verbose=opts.verbose)\n",
"\n",
"print(\"Clustering sparse data with %s\" % km)\n",
"t0 = time()\n",
"km.fit(X)\n",
"print(\"done in %0.3fs\" % (time() - t0))\n",
"print()\n",
"\n",
"print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n",
"print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n",
"print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n",
"print(\"Adjusted Rand-Index: %.3f\"\n",
" % metrics.adjusted_rand_score(labels, km.labels_))\n",
"print(\"Silhouette Coefficient: %0.3f\"\n",
" % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n",
"\n",
"print()\n",
"\n",
"\n",
"if not opts.use_hashing:\n",
" print(\"Top terms per cluster:\")\n",
"\n",
" if opts.n_components:\n",
" original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n",
" order_centroids = original_space_centroids.argsort()[:, ::-1]\n",
" else:\n",
" order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n",
"\n",
" terms = vectorizer.get_feature_names()\n",
" for i in range(true_k):\n",
" print(\"Cluster %d:\" % i, end='')\n",
" for ind in order_centroids[i, :10]:\n",
" print(' %s' % terms[ind], end='')\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n",
"\n",
"Pérez, C. et al. (2016). Recuperación, procesamiento y clasificación de tuits para visualizar estructuras de interacción. Research in Computing Science, 124(1), 23-37. http://www.rcs.cic.ipn.mx/2016_124/Recuperacion_%20procesamiento%20y%20clasificacion%20de%20tuits%20para%20visualizar%20estructuras%20de%20interaccion.pdf\n",
"\n",
"Joachims, T. (1996). A probabilistic analysis of the Rocchio algorithm with TFIDF for text categorization. Computer Science Technical Report CMU-CS-96-118, Carnegie Mellon University. http://rexa.info/paper/7c077ad01b1a7f0605ca075ead0193d4555c2619\n"
]
},
{
......@@ -46,7 +1348,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8rc1"
"version": "3.5.2"
}
},
"nbformat": 4,
......
[{
"titulo":"Fwd: Material de clase",
"remitente":"ghernandez@centrogeo.edu.mx",
"texto":"Hola Jóvenes. Les reenvío el correo de la Maestra Rosa Peralta para la actividad que tienen pendiente con ella. Aprovecho para agradecer el esfuerzo y el empaño para con estas sesiones de clase.",
"spam":"H"
},
{
"titulo":"carlosgarcia8, how to talk Twitter",
"remitente":"info@twitter.com",
"texto":"Connect with a Reply. These top videos on Twitter are the most played for a reason. Take a look, and then join the conversation about them by tapping Reply.",
"spam":"H"
},
{
"titulo":"Boletín CG Abril 2019",
"remitente":"glopez@centrogeo.edu.mx",
"texto":"El pasado 8 de marzo en el marco del día de la mujer CentroGeo fue sede del evento 'Mujeres en ciencia y tecnología, avances y retos de la inclusión y la equidad de género', organizado por el CIMAT, dentro del consorcio de Inteligencia Artificial, al que CentroGeo pertenece.",
"spam":"H"
},
{
"titulo":"Te envío información fiscal.",
"remitente":"fjhguno@hotmail.com",
"texto":" Buen día Sobrino y ahijado. En el archivo anexo encontraras un listado de las deducciones o gastos que son aceptados, para los que ganan por honorarios y que se utilizan en las declaraciones de pagos provisionales de cada mes. A continuación te enviare otro correo, con un desglose de las llamadas deducciones personales que aplican en las declaraciones anuales.",
"spam":"H"
},
{
"titulo":"Cursos Facultad de Ciencias, Educación Continua",
"remitente":"educontinua.of@ciencias.unam.mx",
"texto":"La Facultad de Ciencias de la Universidad Nacional Autónoma de México, a través de su Secretaría de Educación Abierta y Continua, ofrece lo siguiente: XIV Diplomado de Teledección, SIG y modelado aplicado a los recursos naturales. Curso Estadística aplicada al diseño y análisis de estudios experimentales. Curso Teórico- Práctico",
"spam":"H"
}
]
[{
"titulo":"¡ALEJANDRO FERNANDEZ EN EL DOMO CARE NO TE LO PUEDES PERDER!",
"remitente":"mkt@arenamonterrey.com",
"texto":"Para asegurar la entrega de nuestros e-mail en su correo, por favor agregue mkt@superboletos.com a su libreta de direcciones de correo.Si no visualiza bien este mensaje, haga clic aquí",
"spam":"S"
},
{
"titulo":"¡Todos los detalles sobre los horarios en los #20añosVL19! ",
"remitente":"contactoocesa@cie.com.mx",
"texto":"Recuerda incluir nuestro correo como destinatario seguro. Dudas o comentarios contacto@ocesa.com.mx.",
"spam":"S"
},
{
"titulo":"¿Maestría o Segunda Carrera? ¡Sube de nivel!",
"remitente":"envios@occmkt.com",
"texto":"La demanda de personal especializado ha incrementado en los últimos años. Perfiles de áreas como Administración, Derecho, Educación, Mercadotecnia e Ingenierías se han convertido en los más buscados por las empresas debido a las grandes aportaciones que sus habilidades y competencias realizan a las organizaciones.l crecimiento profesional es la llave para obtener mejores oportunidades laborales. Y ahora, gracias al aprendizaje en línea, puedes hacerlo sin descuidar lo más importante de tu vida.​​​​​​​Elige el área en la que te gustaría crecer y comienza o continúa tus estudios de licenciatura o maestría sin sacrificar tu tiempo de calidad con familia y amigos, sin descuidar tu empleo actual y sin privarte de tener tiempo para ti.",
"spam":"S"
},
{
"titulo":"Obtén tu Tarjeta y recibe un Monedero Electrónico Liverpool",
"remitente":"info@occpbl.com",
"texto":"Empieza una nueva aventura con $2000.00 M.N. en un monedero electrónico de Liverpool al adquirir la tarjeta. SOLICÍTALA AQUÍ. Aplica solo para nuevo tarjetahabientes al solicitarlo en línea.",
"spam":"S"
},
{
"titulo":"FW:",
"remitente":"Victoria_LHirondelle@sd33.bc.ca",
"texto":"Carta de Mavis Wanczyk sobre una donación. Por favor, abra adjunto para más detalles.",
"spam":"S"
}
]
[{
"titulo":"DataLab",
"remitente":"pablo.lopez@centrogeo.edu.mx",
"texto":"Hola! Pues eres parte del laboratorio si así lo quieres, justo uno de los principios fundamentales es que aquí todos estemos por gusto. Me gustaría muchísimo que trabajaras con nosotros y, en ese caso, ya tengo pensada tu primera tarea jajajaja. Sería ayudar a organizar el material de un curso de geoinformática, lo que hay que hacer es actualizar las prácticas y hacer una linda paginita para el curso. Si te animas platicamos en la semana.",
"spam": null
},
{
"titulo":"Uno mas",
"remitente":"pablo.lopez@centrogeo.edu.mx",
"texto":" Buen día Sobrino y ahijado. En el archivo anexo encontraras un listado de las deducciones o gastos que son aceptados, para los que ganan por honorarios y que se utilizan en las declaraciones de pagos provisionales de cada mes. A continuación te enviare otro correo, con un desglose de las llamadas deducciones personales que aplican en las declaraciones anuales.",
"spam": null
}
]
\ No newline at end of file
plot_prueba.png (binary image changed: 162 KB → 507 KB)