Respuesta del primer ejercicio del segundo parcial.

71499d6f · Victor Hugo Pacheco Flores · 54d2d9d8 · 71499d6f
Commit 71499d6f authored Mar 23, 2019 by Victor Hugo Pacheco Flores
Hide whitespace changes
Inline Side-by-side

Showing with 394 additions and 0 deletions

Test2-1.ipynb Test2-1.ipynb +394 -0

No files found.
--- a/Test2-1.ipynb
+++ b/Test2-1.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>hello</th>\n",
+       "      <th>omg</th>\n",
+       "      <th>pony</th>\n",
+       "      <th>she</th>\n",
+       "      <th>there</th>\n",
+       "      <th>went</th>\n",
+       "      <th>why</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1.00000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1.30103</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     hello  omg  pony  she  there  went  why\n",
+       "0  1.00000  0.0   0.0  0.0    1.0   0.0  1.0\n",
+       "1  1.30103  1.0   1.0  0.0    0.0   0.0  0.0\n",
+       "2  0.00000  1.0   0.0  1.0    1.0   1.0  0.0"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import math\n",
+    "import pandas as pd \n",
+    "import numpy as np\n",
+    "from sklearn.feature_extraction.text import CountVectorizer \n",
+    "\n",
+    "class MatrizDT:\n",
+    "    \n",
+    "    def __init__(self,d):\n",
+    "        '''\n",
+    "        Constructor que inicializa un objeto de la clase MatrizDT. Calcula por adelantado las tres matrices (tf, idf y\n",
+    "        tf-idf) y las guarda en atributos del objeto (vtf, vidf y vtfidf). Se hizo de esta manera porque, si cada función\n",
+    "        realizara su propio cálculo, tfidf() tendría que volver a calcular las dos matrices anteriores. Es decir, al llamar\n",
+    "        a la función tf() se haría un cálculo y al llamar a la función idf() un segundo cálculo, pero al llamar a la tercera\n",
+    "        función, tfidf(), esta tendría que repetir el cálculo de la primera función y el de la segunda antes de hacer el\n",
+    "        suyo, generando trabajo y uso de memoria adicionales en la tercera función.\n",
+    "        Al precalcular todo en el constructor, los tres resultados quedan disponibles desde el inicio y las funciones\n",
+    "        tf(), idf() y tfidf() solo devuelven el valor ya almacenado, sin recalcular nada.\n",
+    "        \n",
+    "        Args:\n",
+    "        d: un Array que contenga uno o más strings (un string por documento)\n",
+    "        \n",
+    "        Ejemplo:\n",
+    "        >>>docs = ['why hello there', 'omg hello hello pony', 'she went there? omg']\n",
+    "        >>>c=MatrizDT(docs)\n",
+    "        '''\n",
+    "        self.documentos=d\n",
+    "        vec = CountVectorizer()\n",
+    "        x = vec.fit_transform(self.documentos)\n",
+    "        \n",
+    "        #primera función\n",
+    "        matriz=np.array(x.toarray(),dtype=float)\n",
+    "        for i in range(len(matriz)):\n",
+    "            for j in range(len(matriz[0])):\n",
+    "                if(matriz[i,j]!=0):\n",
+    "                    matriz[i][j]=1+math.log(matriz[i][j],10)\n",
+    "        self.vtf= pd.DataFrame(matriz, columns=vec.get_feature_names())\n",
+    "        \n",
+    "        #segunda función\n",
+    "        n=len(self.documentos)\n",
+    "        res=np.zeros((len(matriz),len(matriz[0])))\n",
+    "        for i in range(len(matriz[0])):\n",
+    "            df_t=0\n",
+    "            for j in range(len(matriz)):\n",
+    "                if(matriz[j][i] !=0):\n",
+    "                    df_t +=1\n",
+    "            val=math.log((n/df_t),10)\n",
+    "            for k in range(len(matriz)):\n",
+    "                res[k][i]=val\n",
+    "        self.vidf = pd.DataFrame(res, columns=vec.get_feature_names())\n",
+    "        \n",
+    "        #tercera función\n",
+    "        res2=np.zeros((len(matriz),len(matriz[0])))\n",
+    "        for i in range(len(matriz)):\n",
+    "            for j in range(len(matriz[0])):\n",
+    "                res2[i][j]=matriz[i][j]*res[i][j]\n",
+    "        self.vtfidf=pd.DataFrame(res2, columns=vec.get_feature_names())\n",
+    "                \n",
+    "    \n",
+    "    def tf(self):\n",
+    "        '''\n",
+    "        Devuelve la matriz de frecuencia de término, donde cada celda vale 1 + log10(count(tj,di)) siempre y cuando\n",
+    "        count(tj,di) sea mayor a cero; en caso contrario se deja el cero.\n",
+    "        \n",
+    "        Args: NA\n",
+    "        \n",
+    "        Ejemplo:\n",
+    "        >>>c.tf()\n",
+    "        \thello\tomg\tpony\tshe\tthere\twent\twhy\n",
+    "        0\t1.00000\t0.0\t0.0\t0.0\t1.0\t0.0\t1.0\n",
+    "        1\t1.30103\t1.0\t1.0\t0.0\t0.0\t0.0\t0.0\n",
+    "        2\t0.00000\t1.0\t0.0\t1.0\t1.0\t1.0\t0.0\n",
+    "        '''\n",
+    "        return self.vtf\n",
+    "    \n",
+    "    def idf(self):\n",
+    "        '''\n",
+    "        Devuelve la matriz donde cada celda tiene el valor de la frecuencia inversa de documento del término: log10(n/dft),\n",
+    "        donde n es el número total de documentos y dft es el número de documentos en los cuales aparece el término t.\n",
+    "        \n",
+    "        Args: NA\n",
+    "        \n",
+    "        Ejemplo:\n",
+    "        >>>c.idf()\n",
+    "        \thello\tomg\tpony\tshe\tthere\twent\twhy\n",
+    "        0\t0.176091\t0.176091\t0.477121\t0.477121\t0.176091\t0.477121\t0.477121\n",
+    "        1\t0.176091\t0.176091\t0.477121\t0.477121\t0.176091\t0.477121\t0.477121\n",
+    "        2\t0.176091\t0.176091\t0.477121\t0.477121\t0.176091\t0.477121\t0.477121\n",
+    "        '''\n",
+    "        return self.vidf\n",
+    "    \n",
+    "    def tfidf(self):\n",
+    "        '''\n",
+    "        Devuelve el producto de la frecuencia de término y de la frecuencia inversa de documento. O sea, el producto\n",
+    "        elemento por elemento de las matrices de las dos funciones anteriores.\n",
+    "        \n",
+    "        Args: NA\n",
+    "        \n",
+    "        Ejemplo:\n",
+    "        >>>c.tfidf()\n",
+    "        \thello\tomg\tpony\tshe\tthere\twent\twhy\n",
+    "        0\t0.176091\t0.000000\t0.000000\t0.000000\t0.176091\t0.000000\t0.477121\n",
+    "        1\t0.229100\t0.176091\t0.477121\t0.000000\t0.000000\t0.000000\t0.000000\n",
+    "        2\t0.000000\t0.176091\t0.000000\t0.477121\t0.176091\t0.477121\t0.000000\n",
+    "        '''\n",
+    "        return self.vtfidf\n",
+    "        \n",
+    "        \n",
+    "docs = ['why hello there', 'omg hello hello pony', 'she went there? omg']\n",
+    "\n",
+    "c=MatrizDT(docs)\n",
+    "c.tf()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>hello</th>\n",
+       "      <th>omg</th>\n",
+       "      <th>pony</th>\n",
+       "      <th>she</th>\n",
+       "      <th>there</th>\n",
+       "      <th>went</th>\n",
+       "      <th>why</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.477121</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.477121</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.477121</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      hello       omg      pony       she     there      went       why\n",
+       "0  0.176091  0.176091  0.477121  0.477121  0.176091  0.477121  0.477121\n",
+       "1  0.176091  0.176091  0.477121  0.477121  0.176091  0.477121  0.477121\n",
+       "2  0.176091  0.176091  0.477121  0.477121  0.176091  0.477121  0.477121"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "c.idf()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>hello</th>\n",
+       "      <th>omg</th>\n",
+       "      <th>pony</th>\n",
+       "      <th>she</th>\n",
+       "      <th>there</th>\n",
+       "      <th>went</th>\n",
+       "      <th>why</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.477121</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.229100</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.176091</td>\n",
+       "      <td>0.477121</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      hello       omg      pony       she     there      went       why\n",
+       "0  0.176091  0.000000  0.000000  0.000000  0.176091  0.000000  0.477121\n",
+       "1  0.229100  0.176091  0.477121  0.000000  0.000000  0.000000  0.000000\n",
+       "2  0.000000  0.176091  0.000000  0.477121  0.176091  0.477121  0.000000"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "c.tfidf()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}