Respuesta del primer ejercicio del segundo parcial.

parent 54d2d9d8
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hello</th>\n",
" <th>omg</th>\n",
" <th>pony</th>\n",
" <th>she</th>\n",
" <th>there</th>\n",
" <th>went</th>\n",
" <th>why</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.00000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.30103</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.00000</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" hello omg pony she there went why\n",
"0 1.00000 0.0 0.0 0.0 1.0 0.0 1.0\n",
"1 1.30103 1.0 1.0 0.0 0.0 0.0 0.0\n",
"2 0.00000 1.0 0.0 1.0 1.0 1.0 0.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import math\n",
"import pandas as pd \n",
"import numpy as np\n",
"from sklearn.feature_extraction.text import CountVectorizer \n",
"\n",
"class MatrizDT:\n",
"    '''\n",
"    Document-term matrix with log-scaled TF, IDF and TF-IDF weights.\n",
"\n",
"    The three matrices are computed once in the constructor and cached on the\n",
"    instance, so tf(), idf() and tfidf() are plain accessors and tfidf() never\n",
"    has to recompute the other two.\n",
"\n",
"    Example:\n",
"        >>> docs = ['why hello there', 'omg hello hello pony', 'she went there? omg']\n",
"        >>> c = MatrizDT(docs)\n",
"    '''\n",
"\n",
"    def __init__(self, d):\n",
"        '''\n",
"        Tokenize the documents and precompute the TF, IDF and TF-IDF matrices.\n",
"\n",
"        Args:\n",
"            d: iterable of strings, one string per document. It is passed to\n",
"                CountVectorizer.fit_transform, which tokenizes the text and\n",
"                builds the vocabulary (the matrix columns).\n",
"\n",
"        Attributes set on the instance:\n",
"            documentos: the documents exactly as received.\n",
"            vtf: DataFrame (documents x terms) holding 1 + log10(count) where\n",
"                the term occurs in the document and 0 elsewhere.\n",
"            vidf: DataFrame of the same shape; every row holds log10(n / df_t)\n",
"                for each term t.\n",
"            vtfidf: DataFrame with the element-wise product of vtf and vidf.\n",
"        '''\n",
"        self.documentos = d\n",
"        vec = CountVectorizer()\n",
"        counts = vec.fit_transform(self.documentos).toarray().astype(float)\n",
"\n",
"        # get_feature_names() was removed in scikit-learn 1.2; prefer its\n",
"        # replacement and fall back only on older installations.\n",
"        if hasattr(vec, 'get_feature_names_out'):\n",
"            terms = vec.get_feature_names_out()\n",
"        else:\n",
"            terms = vec.get_feature_names()\n",
"\n",
"        # Term frequency: 1 + log10(count) where count > 0, otherwise 0.\n",
"        # The log is applied only to the non-zero cells, so log10(0) is never\n",
"        # evaluated and no runtime warning is raised.\n",
"        present = counts > 0\n",
"        tf = np.zeros_like(counts)\n",
"        tf[present] = 1 + np.log10(counts[present])\n",
"        self.vtf = pd.DataFrame(tf, columns=terms)\n",
"\n",
"        # Inverse document frequency, taken from the raw counts rather than\n",
"        # from the already-weighted TF matrix. df_t is the number of documents\n",
"        # containing term t; the vocabulary is built from these documents, so\n",
"        # every column has at least one non-zero cell.\n",
"        n_docs = counts.shape[0]\n",
"        doc_freq = present.sum(axis=0)\n",
"        idf_row = np.log10(n_docs / doc_freq)\n",
"        # Repeat the per-term values on every row so vidf has the same shape\n",
"        # as vtf.\n",
"        idf = np.tile(idf_row, (n_docs, 1))\n",
"        self.vidf = pd.DataFrame(idf, columns=terms)\n",
"\n",
"        # TF-IDF: element-wise product of the two matrices above.\n",
"        self.vtfidf = pd.DataFrame(tf * idf, columns=terms)\n",
"\n",
"    def tf(self):\n",
"        '''\n",
"        Return the term-frequency matrix computed in the constructor.\n",
"\n",
"        Each cell is 1 + log10(count(t_j, d_i)) when the count is greater than\n",
"        zero, and 0 otherwise.\n",
"\n",
"        Returns:\n",
"            pandas.DataFrame with one row per document and one column per term.\n",
"\n",
"        Example:\n",
"            >>> c.tf()\n",
"                 hello  omg  pony  she  there  went  why\n",
"            0  1.00000  0.0   0.0  0.0    1.0   0.0  1.0\n",
"            1  1.30103  1.0   1.0  0.0    0.0   0.0  0.0\n",
"            2  0.00000  1.0   0.0  1.0    1.0   1.0  0.0\n",
"        '''\n",
"        return self.vtf\n",
"\n",
"    def idf(self):\n",
"        '''\n",
"        Return the inverse-document-frequency matrix computed in the constructor.\n",
"\n",
"        Each cell is log10(n / df_t), where n is the number of documents and\n",
"        df_t is the number of documents in which term t appears. The value\n",
"        depends only on the term, so all rows are identical.\n",
"\n",
"        Returns:\n",
"            pandas.DataFrame with one row per document and one column per term.\n",
"\n",
"        Example:\n",
"            >>> c.idf()\n",
"                  hello       omg      pony       she     there      went       why\n",
"            0  0.176091  0.176091  0.477121  0.477121  0.176091  0.477121  0.477121\n",
"            1  0.176091  0.176091  0.477121  0.477121  0.176091  0.477121  0.477121\n",
"            2  0.176091  0.176091  0.477121  0.477121  0.176091  0.477121  0.477121\n",
"        '''\n",
"        return self.vidf\n",
"\n",
"    def tfidf(self):\n",
"        '''\n",
"        Return the TF-IDF matrix computed in the constructor.\n",
"\n",
"        Each cell is the product of the matching cells of tf() and idf().\n",
"\n",
"        Returns:\n",
"            pandas.DataFrame with one row per document and one column per term.\n",
"\n",
"        Example:\n",
"            >>> c.tfidf()\n",
"                  hello       omg      pony       she     there      went       why\n",
"            0  0.176091  0.000000  0.000000  0.000000  0.176091  0.000000  0.477121\n",
"            1  0.229100  0.176091  0.477121  0.000000  0.000000  0.000000  0.000000\n",
"            2  0.000000  0.176091  0.000000  0.477121  0.176091  0.477121  0.000000\n",
"        '''\n",
"        return self.vtfidf\n",
" \n",
" \n",
"# Sample corpus; the following cells reuse `c` to display the IDF and TF-IDF matrices.\n",
"docs = [\n",
"    'why hello there',\n",
"    'omg hello hello pony',\n",
"    'she went there? omg',\n",
"]\n",
"\n",
"c = MatrizDT(docs)\n",
"c.tf()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hello</th>\n",
" <th>omg</th>\n",
" <th>pony</th>\n",
" <th>she</th>\n",
" <th>there</th>\n",
" <th>went</th>\n",
" <th>why</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.176091</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.477121</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.477121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.176091</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.477121</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.477121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.176091</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.477121</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.477121</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" hello omg pony she there went why\n",
"0 0.176091 0.176091 0.477121 0.477121 0.176091 0.477121 0.477121\n",
"1 0.176091 0.176091 0.477121 0.477121 0.176091 0.477121 0.477121\n",
"2 0.176091 0.176091 0.477121 0.477121 0.176091 0.477121 0.477121"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.idf()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hello</th>\n",
" <th>omg</th>\n",
" <th>pony</th>\n",
" <th>she</th>\n",
" <th>there</th>\n",
" <th>went</th>\n",
" <th>why</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.176091</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.176091</td>\n",
" <td>0.000000</td>\n",
" <td>0.477121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.229100</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.000000</td>\n",
" <td>0.176091</td>\n",
" <td>0.000000</td>\n",
" <td>0.477121</td>\n",
" <td>0.176091</td>\n",
" <td>0.477121</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" hello omg pony she there went why\n",
"0 0.176091 0.000000 0.000000 0.000000 0.176091 0.000000 0.477121\n",
"1 0.229100 0.176091 0.477121 0.000000 0.000000 0.000000 0.000000\n",
"2 0.000000 0.176091 0.000000 0.477121 0.176091 0.477121 0.000000"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.tfidf()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment