Commit df27422b authored by geobumac's avatar geobumac
parents 07f2a28f 5abbbdef
...@@ -367,7 +367,9 @@ ...@@ -367,7 +367,9 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 56, "execution_count": 56,
"metadata": {}, "metadata": {
"scrolled": false
},
"outputs": [ "outputs": [
{ {
"data": { "data": {
...@@ -834,7 +836,7 @@ ...@@ -834,7 +836,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -843,13 +845,14 @@ ...@@ -843,13 +845,14 @@
"0.98" "0.98"
] ]
}, },
"execution_count": 1, "execution_count": 2,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Cada estimador (clasificador) debe exponer un método \"score\" que nos indica la calidad de predicción.\n", "# Cada estimador (clasificador) debe exponer un método\n",
"# \"score\" que nos indica la calidad de predicción.\n",
"\n", "\n",
"from sklearn import datasets, svm\n", "from sklearn import datasets, svm\n",
"\n", "\n",
...@@ -857,7 +860,8 @@ ...@@ -857,7 +860,8 @@
"X_digits = digits.data\n", "X_digits = digits.data\n",
"y_digits = digits.target\n", "y_digits = digits.target\n",
"svc = svm.SVC(C=1, kernel='linear')\n", "svc = svm.SVC(C=1, kernel='linear')\n",
"svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])\n" "svc.fit(X_digits[:-100], y_digits[:-100])\n",
"svc.score(X_digits[-100:], y_digits[-100:])\n"
] ]
}, },
{ {
...@@ -874,7 +878,8 @@ ...@@ -874,7 +878,8 @@
} }
], ],
"source": [ "source": [
"# Para tener una mejor estimación del modelo, podemos dividir los datos que usamos para entrenamiento y test.\n", "# Para tener una mejor estimación del modelo,\n",
"# podemos dividir los datos que usamos para entrenamiento y test.\n",
"# A esto se le llama validación cruzada\n", "# A esto se le llama validación cruzada\n",
"import numpy as np\n", "import numpy as np\n",
"\n", "\n",
......
...@@ -57,6 +57,327 @@ ...@@ -57,6 +57,327 @@
"\n" "\n"
] ]
}, },
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import math\n",
"\n",
"# Builds document-term matrices (TF, IDF and TF-IDF) from a list of raw texts.\n",
"class MatrizDT:\n",
"    def __init__(self, listaDocs):\n",
"        \"\"\"Class constructor.\n",
"\n",
"        Args:\n",
"            listaDocs [(string)]: list of documents (raw texts to analyse)\n",
"        \"\"\"\n",
"        self.listaDocs = listaDocs\n",
"        self.cleanDocs = self.getArrayCleanDocs()\n",
"        self.totalDocs = len(listaDocs)\n",
"        self.terminos = self.getTerminos()\n",
"        self.listNameDoc = self.getListNameDoc()\n",
"    \n",
"    def removerpunt(self, s):\n",
"        \"\"\"Strip punctuation and non-alphanumeric characters from a string.\n",
"\n",
"        Args:\n",
"            s (string): text the characters will be removed from\n",
"\n",
"        Examples:\n",
"            >>> self.removerpunt(\"este es un texto con texto que es repetido y repetido, \")\n",
"            este es un texto con texto que es repetido y repetido\n",
"        \"\"\"\n",
"        # The first pattern's character class also contains a zero-width space;\n",
"        # the second pass removes any remaining non-word symbol.\n",
"        s = re.sub(r\"[,|\\.|:|?|=|​]\", \"\", s)\n",
"        s = re.sub(r\"[^\\w\\s]\", '', s)\n",
"        return s\n",
"    \n",
"    def getListNameDoc(self):\n",
"        \"\"\"Return one display name per document, numbered from 1.\n",
"        \n",
"        Examples:\n",
"            >>> self.getListNameDoc()\n",
"            [\"Documento1\", \"Documento2\", ..., \"DocumentoN\"]\n",
"        \"\"\"\n",
"        aux = []\n",
"        iCont = 1\n",
"        for i in self.listaDocs:\n",
"            aux.append(\"Documento\" + str(iCont))\n",
"            iCont += 1\n",
"        return aux\n",
"\n",
"    def getArrayCleanDocs(self):\n",
"        \"\"\"Return each document as a list of tokens with unwanted characters removed.\n",
"        \"\"\"\n",
"        # NOTE(review): the text is NOT lowercased here, while getTerminos()\n",
"        # lowercases its vocabulary, so tf() misses capitalised occurrences.\n",
"        # NOTE(review): np.array over ragged lists yields an object array;\n",
"        # recent NumPy versions require passing dtype=object explicitly.\n",
"        aux = []\n",
"        for i in self.listaDocs:\n",
"            texto = self.removerpunt(i)\n",
"            aText = texto.split(\" \")\n",
"            aux.append(aText)\n",
"        return np.array(aux)\n",
"    \n",
"    def getTerminos(self):\n",
"        \"\"\"Return the array of unique lowercased terms found across all documents.\n",
"        \n",
"        Examples:\n",
"            >>> self.getTerminos()\n",
"            [\"este\", \"es\", \"un\", \"ejemplo\"]\n",
"        \"\"\"\n",
"        # NOTE(review): the vocabulary is built from the raw (uncleaned) texts,\n",
"        # so terms may still carry punctuation that cleanDocs tokens lack.\n",
"        listaDocs = self.listaDocs\n",
"        texto = ' '.join(listaDocs)\n",
"        texto = texto.lower()\n",
"        arrayTexto = texto.split(\" \")\n",
"        return np.array(list(set(arrayTexto)))\n",
"    \n",
"    def tf(self):\n",
"        \"\"\"Return the document-term matrix of log-scaled term frequencies.\n",
"        \n",
"        Each cell is 1 + ln(count) when the term occurs in the document and\n",
"        0 otherwise (sublinear tf weighting).\n",
"        \n",
"        Examples:\n",
"            >>> self.tf()\n",
"            array([[0., 0., 1., 0., 1.69314718, 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0.],\n",
"                   [1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1.],\n",
"                   [0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.]])\n",
"        \"\"\"\n",
"        # iCont is incremented but never used; kept to preserve behaviour.\n",
"        aux = []\n",
"        iCont = 1\n",
"        for iDoc in self.cleanDocs:\n",
"            aux.append([(1 + math.log(iDoc.count(i))) if iDoc.count(i) > 0 else 0 for i in self.terminos])\n",
"            iCont += 1\n",
"        return np.array(aux)\n",
"    \n",
"    def idf(self):\n",
"        \"\"\"Return a matrix whose cells hold the inverse document frequency.\n",
"        \n",
"        The idf used here is N / df (total documents over documents containing\n",
"        the term) WITHOUT a logarithm; the same values repeat on every row.\n",
"        \n",
"        Examples:\n",
"            >>> self.idf()\n",
"            array([[3. , 3. , 3. , 3. , 1. , 1. , 3. , 0. , 3. , 3. , 3. , 3. , 0. , 1.5, 3. , 3. ],\n",
"                   [3. , 3. , 3. , 3. , 1. , 1. , 3. , 0. , 3. , 3. , 3. , 3. , 0. , 1.5, 3. , 3. ],\n",
"                   [3. , 3. , 3. , 3. , 1. , 1. , 3. , 0. , 3. , 3. , 3. , 3. , 0. , 1.5, 3. , 3. ]])\n",
"        \"\"\"\n",
"        aux = []\n",
"        data = self.tf()\n",
"        #print(self.tf())\n",
"        for iDoc in self.cleanDocs:\n",
"            aux1 = []\n",
"            for i in range(len(self.terminos)):\n",
"                contUnos = 0\n",
"                # df = number of documents with a non-zero tf for this term\n",
"                for j in data[:,i]:\n",
"                    if j > 0:\n",
"                        contUnos += 1;\n",
"                valor = 0\n",
"                if contUnos > 0:\n",
"                    valor = self.totalDocs / contUnos\n",
"                aux1.append(valor)\n",
"            aux.append(aux1)\n",
"        return np.array(aux)\n",
"    \n",
"    def tfidf(self, tf, idf):\n",
"        \"\"\"Return the element-wise product of the tf and idf matrices.\n",
"        \n",
"        Args:\n",
"            tf (array): document-term matrix of term frequencies\n",
"            idf (array): document-term matrix of inverse term frequencies\n",
"        \n",
"        Examples:\n",
"            >>> self.tfidf(tf, idf)\n",
"            array([[3. , 3. , 3. , 3. , 1. , 1. , 3. , 0. , 3. , 3. , 3. , 3. , 0. , 1.5, 3. , 3. ],\n",
"                   [3. , 3. , 3. , 3. , 1. , 1. , 3. , 0. , 3. , 3. , 3. , 3. , 0. , 1.5, 3. , 3. ],\n",
"                   [3. , 3. , 3. , 3. , 1. , 1. , 3. , 0. , 3. , 3. , 3. , 3. , 0. , 1.5, 3. , 3. ]])\n",
"        \"\"\"\n",
"        return np.multiply(tf, idf)\n",
"    \n",
"    \n",
"    def getDF(self, data):\n",
"        \"\"\"Wrap a document-term matrix in a DataFrame for notebook display.\n",
"        \n",
"        Args:\n",
"            data (array): document-term matrix to display\n",
"        \n",
"        Examples:\n",
"            >>> self.getDF(data)\n",
"        \"\"\"\n",
"        df = pd.DataFrame(data, index=self.listNameDoc, columns=self.terminos)\n",
"        return df\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>es</th>\n",
" <th>en</th>\n",
" <th>mayor</th>\n",
" <th>humanos</th>\n",
" <th>los</th>\n",
" <th>pública</th>\n",
" <th>el</th>\n",
" <th>cantados</th>\n",
" <th>marco</th>\n",
" <th>dentro</th>\n",
" <th>...</th>\n",
" <th>clases</th>\n",
" <th>narrar</th>\n",
" <th>acompañamiento</th>\n",
" <th>unos</th>\n",
" <th>sanitario</th>\n",
" <th>estado</th>\n",
" <th>poesía</th>\n",
" <th>siendo</th>\n",
" <th>variables</th>\n",
" <th>variable</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Documento1</th>\n",
" <td>1.0</td>\n",
" <td>1.693147</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.693147</td>\n",
" <td>0.0</td>\n",
" <td>1.693147</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Documento2</th>\n",
" <td>1.0</td>\n",
" <td>2.386294</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.693147</td>\n",
" <td>1.0</td>\n",
" <td>2.945910</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.098612</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Documento3</th>\n",
" <td>0.0</td>\n",
" <td>2.791759</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.791759</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 277 columns</p>\n",
"</div>"
],
"text/plain": [
" es en mayor humanos los pública el \\\n",
"Documento1 1.0 1.693147 0.0 0.0 1.693147 0.0 1.693147 \n",
"Documento2 1.0 2.386294 0.0 1.0 1.693147 1.0 2.945910 \n",
"Documento3 0.0 2.791759 1.0 0.0 2.791759 0.0 1.000000 \n",
"\n",
" cantados marco dentro ... clases narrar acompañamiento \\\n",
"Documento1 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
"Documento2 0.0 1.0 1.0 ... 1.0 0.0 0.0 \n",
"Documento3 1.0 0.0 0.0 ... 0.0 1.0 1.0 \n",
"\n",
" unos sanitario estado poesía siendo variables variable \n",
"Documento1 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 \n",
"Documento2 0.0 2.098612 0.0 0.0 1.0 1.0 1.0 \n",
"Documento3 1.0 0.000000 0.0 1.0 0.0 0.0 0.0 \n",
"\n",
"[3 rows x 277 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"listaDocs = [\n",
" u'''La Informática es la disciplina o campo de estudio que abarca el conjunto de conocimientos, métodos y técnicas referentes al tratamiento automático de la información, junto con sus teorías y aplicaciones prácticas, con el fin de almacenar, procesar y transmitir datos e información en formato digital utilizando sistemas computacionales. Los datos son la materia prima para que, mediante su proceso, se obtenga como resultado información. Para ello, la informática crea y/o emplea sistemas de procesamiento de datos, que incluyen medios físicos (hardware) en interacción con medios lógicos (software) y las personas que los programan y/o los usan (humanware)''',\n",
" u'''La práctica de la medicina se ejerce dentro del marco económico, legal y oficial del sistema médico que es parte de los sistemas nacionales de salud pública (políticas sanitarias estatales). Las características bajo las cuales se maneja el sistema sanitario en general y el órgano médico en particular ejercen un efecto significativo sobre cómo el servicio de salud, y la atención sanitaria puede ser aprovechada por la población general. Una de las variables más importantes para el funcionamiento del sistema se corresponde con el área financiera y el presupuesto que un Estado invierte en materia de salud. Otra variable implica los recursos humanos que articulan las directivas del sistema sanitario. La otra cara de la moneda en materia de atención médica está dada por el servicio privado de salud. Los honorarios y costos del servicio sanitario corren por cuenta del contratista, siendo de esta forma un servicio generalmente restringido a las clases económicamente solventes. Existen no obstante contratos de seguro médico que permiten acceder a estos servicios sanitarios privados; son, fundamentalmente, de dos tipos:''',\n",
" u'''Hay testimonios de lenguaje escrito en forma de poesía en jeroglíficos egipcios de 25 siglos antes de Cristo. Se trata de cantos de labor y religiosos. El Poema de Gilgamesh, obra épica de los sumerios, fue escrito con caracteres cuneiformes y sobre tablas de arcilla unos 2000 años antes de Cristo. Los cantos de la Ilíada y la Odisea, cuya composición se atribuye a Homero, datan de ocho siglos antes de la era cristiana. Los Veda, libros sagrados del hinduismo, también contienen himnos y su última versión se calcula fue redactada en el siglo III a. C. Por estos y otros textos antiguos se supone justificadamente que los pueblos componían cantos que eran trasmitidos oralmente. Algunos acompañaban los trabajos, otros eran para invocar a las divinidades o celebrarlas y otros para narrar los hechos heroicos de la comunidad. Los cantos homéricos hablan de episodios muy anteriores a Homero y su estructura permite deducir que circulaban de boca en boca y que eran cantados con acompañamiento de instrumentos musicales. Homero menciona en su obra la figura del aedo (cantor), que narraba sucesos en verso al compás de la lira. El ritmo de los cantos no solo tenía la finalidad de agradar al oído, sino que permitía recordar los textos con mayor facilidad.'''\n",
"]\n",
"\n",
"\n",
"\n",
"# Build the document-term matrices and display the TF matrix as a DataFrame.\n",
"mdt = MatrizDT(listaDocs)\n",
"tf = mdt.tf()\n",
"idf = mdt.idf()\n",
"# NOTE(review): `tfidf` is computed but never displayed — the cell shows `tf`.\n",
"# Pass `tfidf` to getDF if the TF-IDF matrix was the intended output.\n",
"tfidf = mdt.tfidf(tf, idf)\n",
"mdt.getDF(tf)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
...@@ -135,6 +456,696 @@ ...@@ -135,6 +456,696 @@
"* http://mathworld.wolfram.com/ZipfsLaw.html" "* http://mathworld.wolfram.com/ZipfsLaw.html"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from scipy import stats\n",
"import json\n",
"import string\n",
"import re\n",
"import pandas as pd\n",
"\n",
"# NOTE(review): hardcoded absolute local path — prefer a configurable data dir.\n",
"path = '/Users/amolina/repo/GNER/'\n",
"file = 'named_entity_recognition_sp_MX_locations.JSON'\n",
"\n",
"# NOTE(review): the file handle returned by open() is never closed.\n",
"MXloc = json.load(open(path + file))\n",
"texto=''\n",
"\n",
"def cuenta_palabras(texto):\n",
"    '''Count how many times each word occurs in a text.\n",
"    \n",
"    Returns a list of (word, count) pairs sorted by count, most frequent first.\n",
"    \n",
"    Args:\n",
"        texto(str): the text string to evaluate\n",
"    \n",
"    Example:\n",
"        >>> frase = 'zayra prueba el ejercicio prueba un perfume prueba el pastel'\n",
"        >>> cuenta_palabras(frase)\n",
"        [('prueba', 3), ('el', 2), ('zayra', 1), ('ejercicio', 1), ('un', 1), ('perfume', 1), ('pastel', 1)]\n",
"    '''\n",
"    cuenta = dict()\n",
"    for palabra in texto.split():\n",
"        if palabra in cuenta: \n",
"            cuenta[palabra] += 1\n",
"        else: \n",
"            cuenta[palabra] = 1\n",
"    \n",
"    return sorted(cuenta.items(), key=lambda x: x[1], reverse=True)\n",
"\n",
"\n",
"def removerpunt(s):\n",
"    # Strip punctuation (the class also holds a zero-width space) and the\n",
"    # START/END/location entity-annotation markers used in this corpus.\n",
"    s = re.sub(r\"[,|\\.|:|?|=|​|\\n|<|>]\", \"\", s)\n",
"    s = re.sub(\"START|END|location\", \"\", s)\n",
"    return s\n",
"\n",
"# Concatenate every cleaned, lowercased document into one corpus string.\n",
"for i in range(len(MXloc)):\n",
"    doc_loc = removerpunt(MXloc[i]['doc_locations']) \n",
"    texto += ' ' + doc_loc.lower()\n",
"\n",
"# Word and frequency lists, ordered by descending frequency.\n",
"espanol = cuenta_palabras(texto)\n",
"pal_esp = list(zip(*espanol))[0]\n",
"frec_esp = list(zip(*espanol))[1]\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Compute ln(rank) and ln(frequency) for every word (Zipf's-law analysis).\n",
"R = len(frec_esp)\n",
"lnfreq = []\n",
"lnrank = []\n",
"\n",
"# Ranks run 1..R inclusive; the previous `range(1, R)` dropped the last word.\n",
"for k in range(1, R + 1):\n",
"    lnfreq.append(np.log(frec_esp[k - 1]))\n",
"    lnrank.append(np.log(k))\n",
"\n",
"# Table restricted to the TOP_N most frequent words.\n",
"TOP_N = 50\n",
"datos_acotados = [espanol[count] for count in range(TOP_N)]\n",
"datos_acotados = pd.DataFrame(datos_acotados, columns=['PALABRA', 'FRECUENCIA'])\n",
"# RANK now has exactly TOP_N values (the original built 110 ranks for a\n",
"# 50-row frame and relied on pandas index alignment to drop the excess).\n",
"datos_acotados.insert(0, 'RANK', pd.Series(np.arange(1, TOP_N + 1)))\n",
"datos_acotados.insert(3, 'ln (RANK)', [lnrank[lr] for lr in range(TOP_N)])\n",
"datos_acotados.insert(4, 'ln (FRECUENCIA)', [lnfreq[lf] for lf in range(TOP_N)])\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TABLA CON DATOS SOBRE LAS 50 PALABRAS MAS FRECUENTES ENCONTRADAS EN EL ARCHIVO .JSON\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>RANK</th>\n",
" <th>PALABRA</th>\n",
" <th>FRECUENCIA</th>\n",
" <th>ln (RANK)</th>\n",
" <th>ln (FRECUENCIA)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>de</td>\n",
" <td>26928</td>\n",
" <td>0.000000</td>\n",
" <td>10.200922</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>la</td>\n",
" <td>14314</td>\n",
" <td>0.693147</td>\n",
" <td>9.568993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>en</td>\n",
" <td>10377</td>\n",
" <td>1.098612</td>\n",
" <td>9.247347</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>que</td>\n",
" <td>9921</td>\n",
" <td>1.386294</td>\n",
" <td>9.202409</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>el</td>\n",
" <td>9529</td>\n",
" <td>1.609438</td>\n",
" <td>9.162095</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>y</td>\n",
" <td>9162</td>\n",
" <td>1.791759</td>\n",
" <td>9.122820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>a</td>\n",
" <td>7493</td>\n",
" <td>1.945910</td>\n",
" <td>8.921725</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>los</td>\n",
" <td>5750</td>\n",
" <td>2.079442</td>\n",
" <td>8.656955</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>del</td>\n",
" <td>4850</td>\n",
" <td>2.197225</td>\n",
" <td>8.486734</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>se</td>\n",
" <td>4773</td>\n",
" <td>2.302585</td>\n",
" <td>8.470730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>por</td>\n",
" <td>3715</td>\n",
" <td>2.397895</td>\n",
" <td>8.220134</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>las</td>\n",
" <td>3636</td>\n",
" <td>2.484907</td>\n",
" <td>8.198639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13</td>\n",
" <td>un</td>\n",
" <td>3473</td>\n",
" <td>2.564949</td>\n",
" <td>8.152774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>14</td>\n",
" <td>con</td>\n",
" <td>3410</td>\n",
" <td>2.639057</td>\n",
" <td>8.134468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>15</td>\n",
" <td>para</td>\n",
" <td>2591</td>\n",
" <td>2.708050</td>\n",
" <td>7.859799</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>16</td>\n",
" <td>una</td>\n",
" <td>2520</td>\n",
" <td>2.772589</td>\n",
" <td>7.832014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>al</td>\n",
" <td>2393</td>\n",
" <td>2.833213</td>\n",
" <td>7.780303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>18</td>\n",
" <td>su</td>\n",
" <td>1942</td>\n",
" <td>2.890372</td>\n",
" <td>7.571474</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>19</td>\n",
" <td>no</td>\n",
" <td>1785</td>\n",
" <td>2.944439</td>\n",
" <td>7.487174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>20</td>\n",
" <td>es</td>\n",
" <td>1436</td>\n",
" <td>2.995732</td>\n",
" <td>7.269617</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>21</td>\n",
" <td>como</td>\n",
" <td>1399</td>\n",
" <td>3.044522</td>\n",
" <td>7.243513</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>22</td>\n",
" <td>lo</td>\n",
" <td>1286</td>\n",
" <td>3.091042</td>\n",
" <td>7.159292</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>23</td>\n",
" <td>fue</td>\n",
" <td>1038</td>\n",
" <td>3.135494</td>\n",
" <td>6.945051</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>24</td>\n",
" <td>más</td>\n",
" <td>1037</td>\n",
" <td>3.178054</td>\n",
" <td>6.944087</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>25</td>\n",
" <td>sus</td>\n",
" <td>781</td>\n",
" <td>3.218876</td>\n",
" <td>6.660575</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>26</td>\n",
" <td>este</td>\n",
" <td>766</td>\n",
" <td>3.258097</td>\n",
" <td>6.641182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>27</td>\n",
" <td>donde</td>\n",
" <td>709</td>\n",
" <td>3.295837</td>\n",
" <td>6.563856</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>28</td>\n",
" <td>seguridad</td>\n",
" <td>672</td>\n",
" <td>3.332205</td>\n",
" <td>6.510258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>29</td>\n",
" <td>dos</td>\n",
" <td>669</td>\n",
" <td>3.367296</td>\n",
" <td>6.505784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>30</td>\n",
" <td>mil</td>\n",
" <td>649</td>\n",
" <td>3.401197</td>\n",
" <td>6.475433</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>31</td>\n",
" <td>entre</td>\n",
" <td>627</td>\n",
" <td>3.433987</td>\n",
" <td>6.440947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>32</td>\n",
" <td>años</td>\n",
" <td>607</td>\n",
" <td>3.465736</td>\n",
" <td>6.408529</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>33</td>\n",
" <td>fueron</td>\n",
" <td>595</td>\n",
" <td>3.496508</td>\n",
" <td>6.388561</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>34</td>\n",
" <td>pero</td>\n",
" <td>582</td>\n",
" <td>3.526361</td>\n",
" <td>6.366470</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>35</td>\n",
" <td>le</td>\n",
" <td>556</td>\n",
" <td>3.555348</td>\n",
" <td>6.320768</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>36</td>\n",
" <td>méxico</td>\n",
" <td>550</td>\n",
" <td>3.583519</td>\n",
" <td>6.309918</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>37</td>\n",
" <td>ha</td>\n",
" <td>549</td>\n",
" <td>3.610918</td>\n",
" <td>6.308098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>38</td>\n",
" <td>esta</td>\n",
" <td>543</td>\n",
" <td>3.637586</td>\n",
" <td>6.297109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>39</td>\n",
" <td>estado</td>\n",
" <td>527</td>\n",
" <td>3.663562</td>\n",
" <td>6.267201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>40</td>\n",
" <td>cuando</td>\n",
" <td>515</td>\n",
" <td>3.688879</td>\n",
" <td>6.244167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>41</td>\n",
" <td>ya</td>\n",
" <td>512</td>\n",
" <td>3.713572</td>\n",
" <td>6.238325</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>42</td>\n",
" <td>o</td>\n",
" <td>510</td>\n",
" <td>3.737670</td>\n",
" <td>6.234411</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>43</td>\n",
" <td>policía</td>\n",
" <td>503</td>\n",
" <td>3.761200</td>\n",
" <td>6.220590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>44</td>\n",
" <td>quien</td>\n",
" <td>468</td>\n",
" <td>3.784190</td>\n",
" <td>6.148468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>45</td>\n",
" <td>sobre</td>\n",
" <td>454</td>\n",
" <td>3.806662</td>\n",
" <td>6.118097</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>46</td>\n",
" <td>personas</td>\n",
" <td>438</td>\n",
" <td>3.828641</td>\n",
" <td>6.082219</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>47</td>\n",
" <td>sin</td>\n",
" <td>435</td>\n",
" <td>3.850148</td>\n",
" <td>6.075346</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>48</td>\n",
" <td>también</td>\n",
" <td>434</td>\n",
" <td>3.871201</td>\n",
" <td>6.073045</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>49</td>\n",
" <td>gobierno</td>\n",
" <td>423</td>\n",
" <td>3.891820</td>\n",
" <td>6.047372</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>50</td>\n",
" <td>ciudad</td>\n",
" <td>420</td>\n",
" <td>3.912023</td>\n",
" <td>6.040255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" RANK PALABRA FRECUENCIA ln (RANK) ln (FRECUENCIA)\n",
"0 1 de 26928 0.000000 10.200922\n",
"1 2 la 14314 0.693147 9.568993\n",
"2 3 en 10377 1.098612 9.247347\n",
"3 4 que 9921 1.386294 9.202409\n",
"4 5 el 9529 1.609438 9.162095\n",
"5 6 y 9162 1.791759 9.122820\n",
"6 7 a 7493 1.945910 8.921725\n",
"7 8 los 5750 2.079442 8.656955\n",
"8 9 del 4850 2.197225 8.486734\n",
"9 10 se 4773 2.302585 8.470730\n",
"10 11 por 3715 2.397895 8.220134\n",
"11 12 las 3636 2.484907 8.198639\n",
"12 13 un 3473 2.564949 8.152774\n",
"13 14 con 3410 2.639057 8.134468\n",
"14 15 para 2591 2.708050 7.859799\n",
"15 16 una 2520 2.772589 7.832014\n",
"16 17 al 2393 2.833213 7.780303\n",
"17 18 su 1942 2.890372 7.571474\n",
"18 19 no 1785 2.944439 7.487174\n",
"19 20 es 1436 2.995732 7.269617\n",
"20 21 como 1399 3.044522 7.243513\n",
"21 22 lo 1286 3.091042 7.159292\n",
"22 23 fue 1038 3.135494 6.945051\n",
"23 24 más 1037 3.178054 6.944087\n",
"24 25 sus 781 3.218876 6.660575\n",
"25 26 este 766 3.258097 6.641182\n",
"26 27 donde 709 3.295837 6.563856\n",
"27 28 seguridad 672 3.332205 6.510258\n",
"28 29 dos 669 3.367296 6.505784\n",
"29 30 mil 649 3.401197 6.475433\n",
"30 31 entre 627 3.433987 6.440947\n",
"31 32 años 607 3.465736 6.408529\n",
"32 33 fueron 595 3.496508 6.388561\n",
"33 34 pero 582 3.526361 6.366470\n",
"34 35 le 556 3.555348 6.320768\n",
"35 36 méxico 550 3.583519 6.309918\n",
"36 37 ha 549 3.610918 6.308098\n",
"37 38 esta 543 3.637586 6.297109\n",
"38 39 estado 527 3.663562 6.267201\n",
"39 40 cuando 515 3.688879 6.244167\n",
"40 41 ya 512 3.713572 6.238325\n",
"41 42 o 510 3.737670 6.234411\n",
"42 43 policía 503 3.761200 6.220590\n",
"43 44 quien 468 3.784190 6.148468\n",
"44 45 sobre 454 3.806662 6.118097\n",
"45 46 personas 438 3.828641 6.082219\n",
"46 47 sin 435 3.850148 6.075346\n",
"47 48 también 434 3.871201 6.073045\n",
"48 49 gobierno 423 3.891820 6.047372\n",
"49 50 ciudad 420 3.912023 6.040255"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The table below lists the 50 most frequent words, not 10 — the old caption\n",
"# contradicted the 50-row DataFrame being displayed.\n",
"print('TABLA CON DATOS SOBRE LAS 50 PALABRAS MAS FRECUENTES ENCONTRADAS EN EL ARCHIVO .JSON')\n",
"datos_acotados"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Se puede observar que ambas gráficas son muy parecidas; la gráfica de la función de densidad de probabilidad no \n",
"tiene un comportamiento 'normal', esto debido a que la frecuencia va decreciendo en una proporción inversa a \n",
"su lugar en el ranking; por ejemplo, la frecuencia de la palabra 'la' es 1/2 de la de la primera palabra 'de', \n",
"como se puede observar en la tabla de rank vs freq, y por ende las probabilidades van decrementando de igual \n",
"forma, hasta llegar a prácticamente cero en las últimas palabras del ranking\n",
"\n",
"De forma experimental y probando varias veces, se hizo una variación del coeficiente 1.78*R de tal forma que\n",
"arrojara una suma de probabilidades cercana a uno (1). Se encontró que el coeficiente original también funciona \n",
"para el idioma español; sin embargo, si se quisieran tomar más decimales, se encontró una aproximación \n",
"de 1.78104 con la que la suma de p(k) dio un poco más cercana a 1.\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA3gAAAFNCAYAAABSRs15AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzs3XecFPX9x/HX5+5o0hUU6SAWigiCgMaCiRosKDEaC1FR0BjBmPhDxV4SS0Rjr6CisXexGzWKQbqigiJNVEB6R5rw+f3xnYNlubIHtzu3d+/n47GPu52Znfns7O7MfObbzN0RERERERGR7JcTdwAiIiIiIiJSOpTgiYiIiIiIlBNK8ERERERERMoJJXgiIiIiIiLlhBI8ERERERGRckIJnoiIiIiISDmhBG87mNksMztiB17/jJn1Ks2YitiWm1mrAqZXMbMpZlY/zdtvHsWQt52vr2JmX5vZ7qUdW1ljZpPNrHsKy5X4szOz7mY2e4cC3A5mdp2ZPZnissPM7B/pjqk8MrNDzGyRmZ1mZg+a2V4pvk77XLKSmfU2s/cytK0Cz6NlmZm9bWZnFTJvR8/LKe+PkpwD0mlH4jCzPmb2vyLmb97Xycua2Soza7k96y0mpmLP6Zm61jSz9mb26Q6uo0x8T8oTJXgZZmbtgf2A16LnfcxsY3QQWGFmX5jZcemOw93XAY8Cg9K9rR10HjDC3X+CzRek66P9lf84JeYYS4W7t3X3j1JYLls+O8mcQ4DjgSOBXYFp8YYjFU1043NN0rG5Ybq25+5PuftR6Vp/aUg6v68ys+/M7LFUb8DsCHc/2t0fT/d20inpOzU/Ov/XiDuuZEXta3ev4e4zMx1TCteaE/OvNaNkcVPC93S2mT1vZgckrdPNbHXCcsui9/glsMzMehYSS66ZjTWzK5OmjTOzgWnaBUVKes8rzexbMzs7mrfVDRAL7olurDeKI97toQQv8/4EPOVbjzA/yt1rAHWA+4FnzaxOBmJ5GjjLzKpkYFvb63zg30nTbo0OmvmP55JftL13JrNINnx2aVMBPt8Scfeb3P1Tdz/H3U9MOr6IZErPpGPz3LgDKgPyz++1gSOANcAEM2sXb1hZo2e0//YHOgNXJS8QXYDrenZrxV1rPgI8b2Z1o3lzo3k1gW7AFOATM/tN0nr3S/h9J16nPhVtcxvuvhE4B7jMzPaJJg8EHLhj+9/iDst/z7WAy4AhZtYmcYHoe/UQ0B04zN3nZDzK7aQfxA6KipWfN7MnorsAk82scxEvORr4uKAZ7r6JkMxUB/ZM2MYLZjbPzJab2Qgza5swb5iZ3Wdmb0bbH2NmexQS68Fm9qNF1QDdfTawlPBjTl62YXTnbOeEaR0tVAOrZGatzOzjKKZFZrZNklVIDB+Z2d/NbGQU73tmVq+QZZsCLYExKa57lpldZmZfAqvNLC96Hy+Z2cLo7ulfEpbPNbMrzGxGFMsEM2uSfPcmIe5+Cc/PMbNvzGypmb1rZs0S5rmZnW9m08xsWfT5WML8c6PXrrRQ/XT/hPiPiP7vYmajotf/ZGb3mlnl/HUU9dlFr68WfTeWmtnXQPKduEL3SwHrGmah2t9/opg/Tnq/d0XfqxXRPjykiHUV+l2O1CtiO25m/c1sGlEJVVHbjvbh+GjefDP7VxFxHWfhjuYyM/vUwt3P/HmzzGygmX0Zxf2cmVUtYl0FfjcsuMPMFkQxfWXRRd6O7GMr5hhkZq2j7++yaN7xhcUuUtqsgKpkSce64r6/Tczs5ehYtdjM7o2mJ1eFO8hCicDy6O9BCfNSPu9Ey18SHXfnmtk5SfOqmNltZvZDdFx50MyqFbcf3H2ju89w9wsI1wDXJayzW3TcWWahFk/3VGI3s6pm9mS0X5ZF73u3hNf1i/7PjWJeZGYzgWOT3tPZtuWcNNPM/pQ0v9D9UcC+axEdv1aa2X+AeknzC32vxey/OcDbQP
4x8yMzu9HMRgI/Ay0tnNeGm9kSM5tuZucmraaqheP3SjP7zMz2S4hrkG25FvjazH637Vuze6Pv1xRLSHws6fog6UWbq7Oa2S5RfCvMbCywR9KyRR3nizynF6C4a81HgWrJMXgw292vAYYC/yxmO/k+An5jhdx0dvdJwL+AoWbWGrgC6Bslf1uxcI35jIVrlMpJ87pauIbITZj2OwvXfSU67ye951cJ11SJCV4u8BjhxkJ3d59f3LrKEiV4peN44FnCXZHhwL0FLWRm1YEWwLeFzM8FzgY2AN8nzHqbkPDtCnxGuFOS6FTgeqAuMB24sYB19wCeAX6fVA3wG0Ix/laiO6+jgN8nTD4deNHdNwB/B96LttkYuKeg91SI0wnvc1egMuFOTkH2BWa6+y8lWPdphJNXHWAT8DrwBdAI+A3wVzP7bbTsxdHyxxDu4JxDOFEUycxOIBycTgTqA58Q9m2i4wgH4PbAH4DfRq89mXBiPzPa5vHA4gI2sxH4G+HkeGAU+wVJyxT42UWuJRy494i2vbkthoU7UkXtl4L0Jnzm9YCJbP0dHAd0AHYmlCy+YIUnQMV9l4vaDkAvoCtbDsJFbfsu4C53rxXth+cLCsjMOhJOdn8CdiHcrRuedKL6A9CD8PttD/QpZF1FfTeOAg4F9iLczf8DW3/2O7KPCzwGmVklwmf9HmGfXwg8ZWZ7FxS/SEwK+/7mAm8QzofNCcerZ5NfbOFG5JvA3YTf8L+AN81sl4TFUjrvROfKgYTqznsSSt0S3UL4DXcAWkUxXVOyt8vLhGrVWKjy9SbwD8LveyDwkm3dxrqw2M8iHEuaEN73+YQSwmTnEs5JHQkXqyclzV8Qza8VbecO23Ljsbj9kexpYALhOPZ3tj73pPJeC2RmTQjn6s8TJp9BaMZRk/AdeRaYDTSM3uNNZvbrhOVPAF5gy3H01egYCTCD8JnUJlxPPWlbt/3vGi1Tj3B+fdkSboCn6D5gLbA74XojOVku6jhf6Dk9WQrXmnlAP2AVRVfnfxnYP1pfkaIEfANQ1LnlJsL+/QS4092/KiC2asCrwDrgD+6+Pmk7Y4DVQOLnejphf0GK5/2kbeZECX0dIDGmp6L382t3L+g6rWxzdz1K+ABmAUdE/18HvJ8wrw2wppDXNSIUSVdNmNYH+AVYRvhxrCF8qQvbdp1oHbWj58OAoQnzjwGmJDx34HLCwa9dAet7CrimkG31Az6M/jfgR+DQ6PkTwMNA42L2VfMohrzo+UfAVQnzLwDeKeS1vYHRSdOGEQ6Qy6LHoqTP5ZyE512BH5JefznwWPT/t8AJxcWcEHe/6P+3CXee8uflEBLDZgn7/OCE+c8Dg6L/3wUuKu57VcC8vwKvlOCzmwn0SHh+HjA7lf1SwLqGAc8mPK9BSECbFLL8UkI1Dgi/jydL8F0udDvRsr8u5vuWuO0RhBN1vWJe8wDw96Rp3xKqY+R/Ln9MmHcr8GAh6yr0u0E4IU0llLrmlPI+LvAYRLhgmZe4PULCeV3Cdv9R1P7RQ49UHtHvZBVbjs2vRtO75x97kpYt9hxKuLm1kIRjccJyfYD/Rf+fAYxNmj8K6BP9/xGpn3ceBW5JeL5XdOxpRTgPrgb2SJh/IPBdIevaHGPS9B7Ahuj/y4B/J81/FziruNgJCcKnQPsCtvERW85ZHwLnJ8w7iqRzXNJrXyU6TxW1Pwp4XVPC9Uz1hGlPE50DinuvxXynvic0YamW8P5uSFi2CeGYWTNh2s3AsITv2eiEeTnAT8AhhWx7ItH1QfQ5zgUsYf5Y4IwC9vVWn3nCdyeXcI23T8K8mwr6fiTMTzzOF3pOL+B1xV1rLgJGs+U32L2gdQH7ROtplPBeVrDlN3530vJziK4Ri3hPQ6P17JU0/TrCzZ2PCTdqrIh1/AN4NPq/JuE32Sx6nup5vzuhEGAZsCT6vE+N5jVPeK//V9R6yvJDJXilY17C/z8TqgEU1EZoWfS3ZtL00R7qMtclfM
ETi+VzzeyWqOrACsIBD7au9pC8/eRGyH8FnvdQRJ6sZkJcyV4CDozuYh1K+DF8Es27lHCyG2uhSk2R1TaSFBdvvqVsu68AbnP3OtEjuZrNjwn/NwMaRlVBllloEHwFsFs0vwnhjlxJNQPuSljnEsK+SGx8W9h7TGmbZraXmb0RVUVYQTgRJL/Xoj67hmy9LxJLhIvbLwXZvC53X0V4zw2jWAdaqN6zPFpX7QJiTfW7XOh2kuensO2+hIuRKRaqLhXWeVEz4P+S9keTpO2m+p0t9Lvh7h8SSibuAxaY2cNmViuV957CPi7sGNQQ+NFDlZx837P1d1WktPRKODaXpPe+wr6/TYDvvfhaHA3Z+hgH237PU/0NF3XsrA/sRGhDl/8bfyeaXhKNCL9vCMeMk5OOPwcTSnmKi/3fhATpWQvVJ29NKJFK9T1hZkeb2WgLVRuXEW4U10vltQVsZ6m7ry5k+VTea7L871Qzd7/A3RNLKBPjaggscfeVSdtuVNDy0TExv7QPMzvTtlTTX0aoCpp4jJ3j0dV/wrpL0olQfSCPoj+Hoo7zJfkcirzWdPd67t7N3d8vJub8RDHxOmP/hN94ctOOoq5JsFDltBfwOKGkLVk3Qg2ZW5L2dbKngROjWjYnAp+5e/7+SPW8D6ENXh1339ndO7h7cu2A44BrS3h9W2Yowcug6KA3g/DlK2j+KuDPwBlRtTEIRc8nEKpF1CbcWYBw0Ziqk4FeZnZRAfNaE6rqFRTPUkLVrlOiOJ7N/9G5+zx3P9fdGxKqtt1vpd+N9JdAi0KS5cIkHhR+JNxZrZPwqOnuxyTML6i9Yv7JaaeEaQ2S1vunpPVWc/dUugkubJvJHiA0ct7TQ1WDK9j2My/0syPcmWyS8LxpUgxF7ZeCbF6XhV7MdgbmRgfsSwnVDetGNyqWFxArpPZdLnA7CfM9YX6R23b3ae5+GqFa0z+BFwupavIjcGPS/tjJ3ZOr3aaiyO+Gu9/t7p0IpRR7AZcU995LuI+TzQWa2NYdEDQl3GkVyYTVJBxLo2qXqSZEPwJNUzgHzCUkD4m293te1LFzEaGWTduE33dtDx01lMTv2HKz9EdCqVbiMaO6u99S3ErcfYO7X+/ubYCDCBekZ5bkPUUXyS8BtwG7RceXt9hyfClqfxS0nbpJx9nkc892vddCJJ7v5wI7m1liUpP8HUg8xuYQmpfMtdDeeQgwANgl2geT2PoY28jMEp83ZetzU3EWEkrQCvscijvOp/w5FHetWQK/IyRPq4tbMKp+W5nCq4VWI3TsMhDoD+xtZn9MWuw9QqnrBxa1JS2Iu39NSHCPZuvqmSU576fiU6An4abt6du5jtgowcu8t4DDCpvp7ksIRdj5dfprEuoiLyacJG/ajm3OJbSzusjM/pw/MfpB7kwoqi/M04QTxkkk/IjM7GQzaxw9XUo40G7a9uXbz0NHItOBLtu5irHASgsdr1SLSpDa2Zauf4cCfzezPS1ob2a7uPtCwknhj9FrzmHrpOxB4HKLOggxs9oW2talYigw0Mw6RdtsZQmdaSSoSagesMpCr1N/TpyZwmf3fBRj3ehzurAE+6Ugx1jopKcyoV3FaHf/MYrzF6JqVGZ2DaEdR0FS+S4Xtp3C1lfots3sj2ZWP7pTm39XsaDv6BDgfAuNt83MqpvZsUkXCqkq9LthZgdE26hEuOhdmxRPaezjZGMId/wvtdBwvTvhhLVNOyaRNJlKKJE7NvruXwWk2vvvWMKF7S3R77Kqmf2qgOXeAvYys9MtdK51CuEmyhvbEe/zQB8za2NmOxHaPgGbS32GENqo7QrhWGxFt18mWi7XQgck9xCqh10fzXoS6Glmv42WqWqhY5rGha5syzoPN7N9o6R5BaEKYEHHuOeBv5hZYwu9JiYOsVOZ8HksBH4xs6MJVTiL3R/JolKU8cD1ZlbZzA4mHG/ybfd7LU50rPwUuDlab3tCaU7i2GqdzOzE6IbBXwnno9GEju2csA
+w0F1+ci+nuxL2YaXomN6a8L1LNb6NhDZt15nZThZ6azwrYZHijvNFndMLUuS1ZmGic2AjM7uW0EznihRfehihSc+6QuZfD8xy92FRwvgnwu9oq9o+7n4r4Vrzg+R5SZ4GLiLULnshIf5Uz/spcfePCaWED5vZ74tbvixRgpd5DwO9zayou+93Ei722hPaun1PSDi+puhkrFDu/gMhyRtkW3p7Oh14vIgfJIQqo3sC89w9sbToAGCMma2KlrnI0zPWy0OE9hUlFh1QjyM0Wv6OcPd1KKH0CEJD/OcJd41WEO4u5feGdi6hdGUx0JZw4shf7yuEO0PPWqhqOIlwJymVmF4gdILzNLCS0NahoIbaAwmfz0rCBUVyL6XFfXbXE74330Xvb/NQEynsl4I8TTixLwE6Afl33t4lVFGaGm1vLUnVKBOk8l0ubDsFKW7bPYDJ0Xf0LkL9+m06IHD38YTP+17CzYrpFNKJSnGK+W7UInyWS6N4FwODE15eGvs4OZ71hAusowmf8/3Ame4+ZXven0hJuftyQruxoYTf/mpC1bhUXruR8P1tBfwQvW6bcU89dIBwHPB/hN/VpcBx7r5oO+J9m3AO/pBwLPgwaZHLoumjo9/4+xTdscSB0TFoBaGtVi3gAI86mIgSk/zOmRYSftuXkNr1WQPgxWjd3xDaLyUPKwThuPMuocbHZ4REI//9rgT+QjgXLiWcW4YnzC9ufyQ7ndDOewnhePZEwrp25L2m4jRCzZC5wCvAtUnVEF8jfH+WEq4rToxKQb8Gbie025xP6OBtZNK6xxCuhRYRzuEneck73hhAqF47j9D++bGEecUd5ws9pxcilWvNRA2j7+kqQmcv+xJ6jnwvxdf3Jtzg3IaFXnH/RMIwCu7+H8INmG2qarr73wnXRu9b4R3ZPMOWpDLxd57Seb8kolhPAR63Qsb6K4us6Gqukg5m9jShTdyrMcZQhXCwP9TdF8QVR3GiOD8HfuPRYOcVXaY/OzMbRmiAvc34Q1I6tI9FRKQ0ZepaMyqMeMjdD0zndqRkNFhwDNw99rq8UcnPPsUuGLMozjbFLliBZMtnJyIiIvHI1LWmu39J6E1WyhBV0RQRERERESknVEVTRERERESknFAJnoiIiIiISDmhNngiIiJlhIUxm+4H1gMfuftTMYckIiJZJiuqaNarV8+bN28edxgiIpJmEyZMWOTuqQ6EnRXM7FFCN/4L3L1dwvQehK68c4Gh7n6LmZ0BLHP3183sOXffZliARDo/iohUHKmeI7OiBK958+aMHz8+7jBERCTNzOz7uGNIg2GEcRY3jwkWDU59H3AkYXy3cWY2HGgMfBUttrG4Fev8KCJScaR6jlQbPBERkTRy9xGEgZ8TdQGmu/vMaED6ZwmDQM8mJHlQyDnazM4zs/FmNn7hwoXpCltERLKUEjwREZHMawT8mPB8djTtZeD3ZvYA8HpBL3T3h929s7t3rl+/XNVmFRGRUpAVVTRFREQqAndfDZwddxwiIpK9VIInIiKSeXOAJgnPG0fTREREdogSPBERkcwbB+xpZi3MrDJwKjA81RebWU8ze3j58uVpC1BERLKTEjwREZE0MrNngFHA3mY228z6uvsvwADgXeAb4Hl3n5zqOt39dXc/r3bt2ukJWkREspba4ImIiKSRu59WyPS3gLcyHI6IiJRzKsETERHJMqqiKSIihakQCZ573BGIiIiUntKsouk6SYqIlCsVIsG75ho4/3xYuTLuSERERMqOxT8vpuvQrnww84O4QxERkVJS7hO8hQvhX/+Chx6CffeFD3QOExERAeD+cfczbu44ejzVg8cnPh53OCIiUgrSluCZ2aNmtsDMJiVM29nM/mNm06K/ddO1/Xz168OYMdCpE3z/PRxxBPTvD6tWpXvLIiIi6VFabfCuPPRKLjnoEn7Z9At9XuvDdR9dpyqbIiJZLp0leMOAHknTBgEfuPuewAfR87Rr1w5GjYK//x0qVYL774
f27eGjjzKxdRERkdJVWm3wciyHW4+8lfuOuY8cy+H6j6/n7NfOZv3G9aUUqYiIZFraEjx3HwEsSZp8ApBfB+RxoFe6tp+sUiW46ioYPx46dIDvvoPDD4cLL4TVqzMVhYiISNlzwQEX8Nqpr7FTpZ14/IvHOfqpo1m2dlncYYmIyHbIdBu83dz9p+j/ecBuGd4+7dvD2LFw/fWQlwf33humjRiR6UhERETKjuP2Oo6P+3zMbtV348PvPuTgRw/mh+U/xB2WiIiUUGydrHio5F9oRX8zO8/MxpvZ+IULF5bqtitVCj1rjhsH++0HM2fCYYfBRRepNE9ERCquzg07M7rfaFrXa83khZPpNrQbn//0edxhiYhICWQ6wZtvZrsDRH8XFLaguz/s7p3dvXP9+vXTEkyHDqE079prQ2ne3XeHhO+TT9KyORERkVKRzoHOm9dpzshzRtK9eXd+WvUThzx2CG9Ne6vUtyMiIumR6QRvOHBW9P9ZwGsZ3v42KleG664Lid6++8KMGaE0729/g59/jjs6ERGRbZXmQOcFqVutLu/0foc/tv8jqzes5vhnjueh8Q+lZVsiIlK60jlMwjPAKGBvM5ttZn2BW4AjzWwacET0vEzo2DF0wHL11ZCTA3feGUr4Pv007shEREQyr0peFZ7o9QRXHXIVG30j5795PoPeH8Qm3xR3aCIiUoR09qJ5mrvv7u6V3L2xuz/i7ovd/Tfuvqe7H+Huyb1sxqpyZbjhhjBuXtu2MG0aHHww/N//wZo1cUcnIiKSWWbG33/9dx45/hHycvL458h/0vvl3qz9ZW3coYmISCFi62SlLOvUCSZMgCuuCKV5//pXKM0bNSruyERERDLvnI7n8Obpb1Kzck2enfQsR/77SBb/vDjusEREpABK8ApRpQrceGNI6tq0galTQ2nepZfCWt24FBGRGKWzk5XCHLXHUXxy9ic0qtmI//3wPw569CBmLp2Zse2LiEhqlOAV44ADQmneoEHh+eDBob3emDHxxiUiIhVXujtZKcx+DfZjdL/RtN+tPVMXT6Xb0G6MnTM2ozGIiEjRlOCloGpVuPnm0OHKPvvAlClw0EEh6VNpnoiIVCSNazXmk7M/4ag9jmLhzwvpPqw7r055Ne6wREQkogSvBLp2hc8/D9U0Af75z9Beb6xuXoqISAVSq0ot3jjtDfp27MuaX9Zw4nMnctfou+IOS0REUIJXYlWrhsRu5EjYe2/4+mvo1g369YMFhQ7bLiIiUr5Uyq3EkJ5D+Mfh/8Bx/vruX/nrO39l46aNcYcmIlKhKcHbTt26hdK8yy6DvDx45BHYc0+44w7YsCHu6ERERNLPzLjy0Ct58ndPUimnEneNuYuTXziZnzf8HHdoIiIVlhK8HVCtGtxyC0yaBMccAytWwMUXQ/v28O67cUcnIiKSGb3b9+a9M96jTtU6vDLlFX79+K9ZsFrVWkRE4qAErxTstRe8+WZ47Lln6ISlRw84/niYPj3u6EREpLyJY5iE4nRv3p2R54ykWe1mjJkzhgMfOZCpi6fGHZaISIWjBK8UHXNMKM0bPBhq1oTXX4e2bUNvmytXxh2diIiUF3ENk1CcNvXbMLrfaDrt3omZS2dy4CMH8uqUV3H3uEMTEakwlOCVssqVYeDAMDB6nz6wfn3olGXvveHf/4ZNm+KOUEREJH0a1GjAx30+pudePVmyZgm/e+53HPP0MUxbPC3u0EREKgQleGnSoAE89hiMHg1dusBPP8GZZ8KvfgXjxsUdnYiISPpUr1ydV055hXuOvofaVWrzzvR3aPdAO6744ApWr18dd3giIuWaErw069oVRo2CYcNC0pef8P3xj3DvvfDaa/DZZ7BwIagGi4iIlBe5ObkM6DKAqRdO5ewOZ7N+43pu/t/NtL6vNS9+/aKqbYqIpIllwwG2c+fOPn78+LjD2GErVsCNNxY+lELVqtCkSXg0bRr+Nm8OLVuGR6NGkJub8bBFRDLGzCa4e+
e448gW2XR+HD17NP3f6s9nP30GwBEtj+DuHnfTun7rmCMTEckOqZ4jleDFYNo0ePFF+OGH8Pjxx/C3uM7QKlXaOuFr1Qr22Sc8mjVT8ici2U8JXslk2/lx46aNDPlsCFd8cAVL1y4lLyePv3X7G1cfejU1q9SMOzwRkTJNCV4WWrlyS7KX/3fWLJg5MzzmzSv8tVWrhuEaWrcO7fz+/OcwALuISDZRglcy2Xp+XPTzIq784EqGfDYEx2lYsyG3HXkbp7Y7FTOLOzwRkTJJCV45tHr11gnf1KlhzL0pU2Du3K2X/cMf4MknQ6mfiEi2UIJXMtl+fhw3ZxwD3h7A2DljgTCW3j1H30O7XdvFHJmISNmjBK+CWbEiJHoTJ8Ill4TnPXvC88+H0j0RkWygBC81ZtYT6NmqVatzp03L7uEHNvkmHv38UQa9P4jFaxaTa7l0a9yN3Jyi2x3UrFyT/gf0p0erHir1E5EKQQleBTZ+PPz2t7BkCRx5JLzyClSvHndUIiLFU4JXMuXp/LhkzRKu/vBqHpzwIJs89UFjD29+OLceeSudG+prIyLlmxK8Cu6rr+CII2DBAjj4YHjzTahVK+6oRESKpgSvZMrj+fG7pd/xw/Ifil1u7Jyx3Py/m1m6dikAp7Y7lX8c/g/22HmPdIcoIhILJXjCt9/Cb34Dc+bAAQfABx9ATXVSJiJlmBK8kqno58ela5Zy8/9u5u4xd7Nu4zoq5VTi/M7nc/WhV1O/ev24wxMRKVVK8ASA774LSd5330G/fjBkSNwRiYgUTgleyej8GPyw/Aeu+e81PPHFEzhOzco16dCgwzbL5VgOf2j7B/7c+c9qtyciWUcJnmw2eTJ06gTr1sHrr8Nxx8UdkYhIwZTglYzOj1v7cv6XDHp/EG9Pf7vI5U5pewpDjx9Kjco1MhSZiMiOS/UcqZHSKoC2beHGG2HgwFCKN2kS1KsXd1QiIiKlq/1u7Xmr91tMXjCZxWsWbzN/+pLpXPTORTw3+TkmLZjEy6e8zF677BVDpCIi6aMEr4L4619h+HAYMSIMgv7886DaKSIiUh613bVtgdMPbXYoBzY+kBOfP5HJCydzwJADeKLXE5ywzwkZjlBEJH1y4g5AMiM3F4YV1KZTAAAgAElEQVQNgxo14MUX4Zln4o5IREQk81rXb83YfmP5fevfs2LdCno914vBIwfHHZaISKlRgleBtGgBd9wR/u/fH2bPjjceERGRONSsUpMXTn6BwUcOxjCu+PAKpi6eGndYIiKlQgleBdO3Lxx7LCxbBiedBKtWxR2RiIhI5pkZAw8ayNkdzuaXTb9w+QeXxx2SiEipUIJXwZjB0KHQrBmMGQMnnABr18YdlYiISDxuOPwGquVV4+VvXmbkDyPjDkdEZIcpwauAGjSA998Pfz/8EE45BTZsiDsqERFJlZn1NLOHly9fHncoWa9RrUb834H/B8DA/wwkG4aPEhEpihK8CqpVK3jvPahbN/Su2acPrFkTd1QiIpIKd3/d3c+rXbt23KGUC5f+6lJ2rb4ro2eP5sWvX4w7HBGRHaIErwLbd194553Qs+bTT8NOO0H9+tCxI1xyCegmpoiIVAQ1q9TkusOuA+DyDy5n/cb18QYkIrIDlOBVcF26wBtvQOvWkJcHixbBxIlw223w1ltxRyciIpIZ/fbvx9677M2MpTN4YNwDcYcjIrLdlOAJhx0GX38dOluZOxeuuSZMv+oq2LQp3thEREQyoVJuJW498lYAbhhxA8vWLos5IhGR7aMETzbLzYXdd4dBg6Bhw1CS9/LLcUclIiKSGT336smhzQ5lyZol3DPmnrjDERHZLkrwZBvVqoXSOwileRs3xhuPiIhIJpjZ5rZ4d4y+gxXrVsQbkIjIdlCCJwXq2xeaN4dvvgkdsIiIiFQE3Zt35+CmB7N07VLuH3d/3OGIiJSYEjwpUOXKcN114f8rroDbbw9j5y1TkwQRESnHzIyrD70agNtH3c7q9a
tjjkhEpGSU4EmheveGdu1g9mwYOBCOPDIMjt6nD4wbF3d0IiIi6XFkyyPp2qgri35exIPjH4w7HBGRElGCJ4XKy4MRI+CBB+D886FbN1i/Hh5/PAyvsNtucNRRYUgFjZknIiLlhZlxzWGhS+nBnw5mzYY1MUckIpK6WBI8M/ubmU02s0lm9oyZVY0jDile3bohuXvgARg1CqZNg//7P9hlF1iwAP7znzAo+qhRcUcqIiJSeo5udTSddu/E/NXzGfLZkLjDERFJWcYTPDNrBPwF6Ozu7YBc4NRMxyHbZ489QondwoUwcyaceWaY/uyz8cYlIiJSmhLb4l32/mW0uKsFLe5qwd737s2bU9+MOToRkcLFVUUzD6hmZnnATsDcmOKQ7WQGLVrAgAHh+QsvaDgFEREpX47f+3i6Ne7G2l/WMmvZLGYtm8XUxVO56r9X4WqbICJlVMYTPHefA9wG/AD8BCx39/cyHYeUjs6doWVLmDcvtNcTEREpL8yMEX1GMPMvM5n5l5lMu3Aau1TbhYnzJvLZT5/FHZ6ISIHiqKJZFzgBaAE0BKqb2R8LWO48MxtvZuMXLlyY6TAlRWZwyinh/+eeizcWERGR0lYptxIt6ragRd0WtNq5FWfuF9omqF2eiJRVcVTRPAL4zt0XuvsG4GXgoOSF3P1hd+/s7p3r16+f8SAldadGLShffBE2bIg3FhERkXTqt38/AJ7+6mlWrV8VczQiItuKI8H7AehmZjuZmQG/Ab6JIQ4pJfvuC61bw+LFoTTvxRfDcAoiIlIyZtbSzB4xsxfjjkUK1qZ+Gw5qchAr16/khckvxB2OiMg24miDNwZ4EfgM+CqK4eFMxyGlxwwuuyz8/8orcPLJsP/+MHJkvHGJiGSSmT1qZgvMbFLS9B5m9q2ZTTezQUWtw91nunvf9EYqO+rc/c8FVE1TRMqmWHrRdPdr3X0fd2/n7me4+7o44pDSc9ZZYYy8wYPDUAqTJ8PBB6tdnohUKMOAHokTzCwXuA84GmgDnGZmbcxsXzN7I+mxa+ZDlu1xcpuTqVWlFqNmj2LygslxhyMispW8uAOQ8qNVKxg4EPr3h0GD4O674dZbt3TCIiJSnrn7CDNrnjS5CzDd3WcCmNmzwAnufjNwXGYjlNJSvXJ1Tm93Og9OeJDDhh1GzSo1C1wuLyeP67tfz+n7np7hCEWkIotrHDwpx6pVg3/+E2rXhs8+g6++ijsiEZHYNAJ+THg+O5pWIDPbxcweBDqa2eWFLKNepsuAAV0GUDm3MovXLN48Rl7yY/qS6dw79t64QxWRCkYleJIWVauG3jUfeggefxxuuy3uiEREyj53XwycX8wyDxO1Xe/cubNG245J213bMn/gfJauWVrg/BXrVtDhoQ58Of9LNm7aSG5OboYjFJGKSiV4kjZnnRX+Pvkk/PJLvLGIiMRkDtAk4XnjaJqUA3Wq1tk8Rl7yY78G+9GkVhNWb1jN9CXT4w5VRCoQJXiSNt26wV57wfz58IJ6khaRimkcsKeZtTCzysCpwPCYY5IM6dCgAwAT502MORIRqUiU4EnamMGFF4b/zz0XJkyINx4RkXQys2eAUcDeZjbbzPq6+y/AAOBdwpivz7v7Dne7aGY9zezh5cuX7+iqJI06NugIwOfzPo85EhGpSJTgSVr17w9nngmrV8Oxx8KSJXFHJCKSHu5+mrvv7u6V3L2xuz8STX/L3fdy9z3c/cZS2tbr7n5e7dq1S2N1kiYqwROROCjBk7QygyFDoEuXUFXz5ZfjjkhERCQz8hO8z+d9jrv6wxGRzFCCJ2lXuXKooglK8ERESoOqaGaH5nWaU7tKbRasXsC8VfPiDkdEKggleJIRJ5wAOTnw/vug6xERkR2jKprZwcy2KsUTEckEJXiSEfXrw6GHwoYN8OabcUcjIiKSGZs7WvlJCZ6IZIYSPMmYE08Mfx9+GOZoFCgREakANne0Ml8drYhIZijBk4z53e+galX4+GNo1g
yeeCLuiEREspPa4GWPjruHErzh3w6n5V0tNz9Oev4kNm7aGHN0IlIeKcGTjGncGP7731CSt3Ej/OlP8OWXcUclIpJ91AYve7Su15oWdVqwfuN6vlv23ebHS9+8pHZ5IpIWSvAko7p1g5degn79YO1aOOWUkOyJiIiUR5VyK/F1/6+Z8ZcZmx+ntjsVgE++/yTm6ESkPFKCJ7G4+25o0gSmTIHx4+OORkREJH2q5lWlZd2Wmx+/3eO3AHzygxI8ESl9SvAkFtWqQc+e4f933ok3FhERkUw6pOkhQEjwNAC6iJQ2JXgSmx49wt+33443DhGRbKNOVrJby7ot2b3G7iz6eRFTFk2JOxwRKWeU4ElsDj8cKleGsWNh8eK4oxERyR7qZCW7mRmHNjsUgBHfj4g5GhEpb5TgSWxq1IBDDgH3kOzdfXfcEYmIiGRGYjVNEZHSpARPYnXyyeHvV1/BpZfChg3xxiMiIpIJhzQLCd5TXz1F7g25XPfRdfEGJCLlhhI8idV558EXX0DTprBuXehVU0REpLxrt2s7ftXkVwBs8k08+vmjMUckIuWFEjyJlRm0bw9duoTnEybEG4+ISDZQJyvZL8dy+OTsT/jl6l+oWbkmP674kZ9W/hR3WCJSDijBkzJh//3D388+izcOEZFsoE5WygczIzcnlwMaHQDAuLnjYo5IRMoDJXhSJnTqFP4qwRMRkYqmS8NQjWXM7DExRyIi5YESPCkTOnYMfydOhI0b441FREQkk7o0Cgne2LljY45ERMoDJXhSJtSvD02awOrVcMUVsHJl3BGJiIhkRtfGXQEYN2ccm3xTzNGISLZTgidlxrHHhr+33gqnnRbGxxMRESnvGtZsSKOajVi+bjmTF0zGdQIUkR2gBE/KjPvug//8B2rXhjffhCeeiDsiERGRzMivptn+wfZ0eKgD635ZF3NEIpKtlOBJmZGTA0ccAXfeGZ6ffz688gosWxZvXCIiZY2GSSh/ztzvTGpVqYVhfDn/S4Z/OzzukEQkSynBkzLnrLPCAOhr18KJJ0LDhvDAA6qyKSKST8MklD+99unF8kHLufvouwEY+vnQmCMSkWylBE/KHDO4/34YNCgMgr5mDVxwAfTqBYsWxR2diIhI+vTetzdVcqvwnxn/YdayWXGHIyJZSAmelEm5uXDzzfDFF/Dss6Fd3vDhYTiFZ57RUAoiIlI+1a1Wl5PanITjPPGFGqOLSMkpwZMy75RTQqJ34IEwezacfjrstx98+23ckYmIiJS+E1ufCMCnP34acyQiko2U4ElWaNYMRoyAoUPD/5Mnw29+E3refPtt+PnnuCMUkWxnZgea2X1m9qWZLTSzH8zsLTPrb2Zq7CYZ07FBRwAmzpsYcyQiko2U4EnWyMuDvn1Dcte5M8yZAwMGwDHHwB57wKhRcUcoItnKzN4G+gHvAj2A3YE2wFVAVeA1Mzs+vgilImlepzm1qtRi/ur5zFs1L+5wRCTL5MUdgEhJVa8OH34Ijz8On38OY8aEpO+QQ0Kp3n77wTnnwD77xB2piGSRM9w9uRunVcBn0eN2M6uX+bCkIjIz9tttPz754RO+mPcFDVo1iDskEckiKsGTrFSzZii9e+SRkORdfDFs2gTvvQeDB4cSvlmz4o5SRLJFAcnddi0jUlo6NOgAqJqmiJScEjzJepUqwe23w4wZ8PLL0KkTrF4N99wTd2Qikm3MbKWZrSjgsdLMVsQdXz4NdF7+bU7w5ivBE5GSiSXBM7M6ZvaimU0xs2/M7MA44pDypUUL+N3v4KGHwvN77oH33483JhHJOncCg4BGQGPgMuBOd6/p7rVijSyBBjov//ITvLFzxjJ+7njcPeaIRCRbxFWCdxfwjrvvA+wHfBNTHFIOdeoUqmxu2ADHHw9HHw2vvRZ3VCKSJY539/vdfaW7r3D3B4AT4g5KKp629duSl5PHzKUzOWDIATz55ZNxhyQiWSLjCV7U1fShwCMA7r7e3ZdlOg4p3w
YPhrPPhjVr4J13oFevUMJ33HFw550wZUrcEYpIGbXazHqbWa6Z5ZhZb2B13EFJxVMlrwq3HnEre++yNwAjfxwZc0Qiki3iKMFrASwEHjOzz81sqJlVjyEOKcdycuDRR2HqVLjmGqhSJXS68uab8Le/Qdu2cOWVMHYsrF0bd7QiUoacDvwBmB89To6miWTc3w78G3f1uAuAbxapspOIpCaOBC8P2B94wN07Eu6MDkpeyMzOM7PxZjZ+4cKFmY5Ryok994Trr4clS+Cbb+Cxx0Ip3qZNcNNN0LUrNG8O48fHHamIlAXuPsvdT3D3eu5e3917ufusuOOSiqtN/TYAfLNQCZ6IpCaOBG82MNvdx0TPXyQkfFtx94fdvbO7d65fv35GA5TyZ6edwrh4ffrA8OHwxBPQuzc0bQrz50OPHqGdntqwi1RsZraXmX1gZpOi5+3N7Kq445KKq3GtxtSoXIOFPy9k0c8aqUNEipfxBM/d5wE/mtne0aTfAF9nOg6puMzgjDPgySdh+nQ48EBYvDi00+vYEV59Ne4IRSRGQ4DLgQ0A7v4lcGqsEUmFZmbsU28fQKV4IpKauHrRvBB4ysy+BDoAN8UUh1RwlSqFoRTuvBPq1oUvvoCTTgrt9FQzWKRC2sndxyZN+yWWSEQireu1BtQOT0RSE0uC5+4To+qX7aP2DUvjiEMEQvXNiy6CH3+E006DjRtDwnfwwWqbJ1IBLTKzPQAHMLOTgJ/iDUkqOrXDE5GSyEtlITPrDBwCNATWAJOA/ygxk/KkenV46ik45RT4059CD5zdusGFF4aqm82aheqclSvHHamIpFF/4GFgHzObA3wH9I43JKno8kvwHhj/AM9Ofpadq+3M66e9Tsu6LWOOTETKoiJL8MzsbDP7jNAeoRrwLbAAOBh438weN7Om6Q9TJDPM4IQTwvAJfftuKc076yzo3h0OOAAWqY27SLlkZjlAZ3c/AqgP7OPuB7v79zGHJhXcgU0OpE7VOqzbuI55q+bx9cKveXjCw3GHJSJllHkR3QaaWX/gUXdfU8j8DkA9d38/TfEB0LlzZx+vunISg5dfhpEjYd48eOEF2LAhVN0cMSIkgyJSusxsgrt3jnH74+PcfqrMrCfQs1WrVudOmzYt7nAkA9ZsWMOytcsYO2csvZ7rRcu6LZl+4XRMJyORCiPVc2SRCV5ZoQRPyoJJk6BDh1Cq16MHvPRSaL8nIqWnDCR4twCLgOcI47QC4O5L4oqpKDo/VjwbN22k0b8aMX/1fIb0HEKjmo0A6NSwE7tW3zXm6EQknVI9R6baBu8eogbnBXH3v5QgNpGs1K4d3HUXDBgA77wDLVrA1VdD//4qzRMpR06J/vZPmOaAGjtJmZCbk8uJrU/kgfEPcO7r526e3qZ+GyZfMDnGyESkrEgpwQOqAG0IdzQBTiaMXTcqHUGJlFUXXABVqsDDD8O4caEDlp9+guuvh7xUf00iUma5e4u4YxApzqW/upSFPy9k1fpVAIz4fgRfL/ya75Z+R4u6+gqLVHSpDpPQHuju7ve4+z2Ewck7uPvj7v54+sITKVvMoF8/GDMGLr44TLvpJujcWUMqiGQzM7sp4f8j44xFpDjN6zTnhZNf4O3eb/N277c5ao+jAPjvrP/GHJmIlAWpJnh1gVoJz2tE00QqJDO49Va48UaoVy8MkN61KwwbFndkIrKdeiT8/8/YohDZDr9u/msAhn87nK/mf7XV49tF35IN/S2ISOlJtVLZLcDnZvZfwIBDgevSFZRINsjNhSuugHPPDVU1n3sulO41bgxHHBF3dCIiUlEc3uJwAF779jVe+/a1bebf0P0Grj7s6kyHJSIxSSnBc/fHzOxtoGs06TJ3n5e+sESyR/368Mwz0KBB6ITl6KNDG72zz447MhEpgV3N7GLCTcz8/zdz93/FE5ZI8drWb0u/jv0YPWf0VtPXbFjDjKUzeHfGu0rwRCqQVHvRNOAIoKW732BmTc2si7uPTW
94ItnBDG6/HSpXhsGD4ZxzoHZtOPHEuCMTkRQNAWoW8L9ImWdmDDl+yDbTF/28iPqD6/P5vM/ZuGkjuTm5MUQnIpmWahXN+4FNwK+BG4CVwEvAAWmKSyTr5OaGdnlLl8LQofD738Ppp8PAgbDffpCTaotXEck4d78+7hhESlu9nerRrHYzvl/+PVMWTaHtrm3jDklEMiDVS86u7t4fWAvg7kuBymmLSiSLXX01HHts+P/pp2H//UO7vAED4L33QG3dRcoeM7vKzArtPMzMfm1mx2UyJpHS0KlhJwAOG3YYTe5oQou7WvDvL/4dc1Qikk6pJngbzCyXaLBzM6tPKNETkSRNm8Ibb8DEiaHTlZo1w1h5990Hv/0t9OoFa9fGHaWIJPkKeMPMPjCzwWZ2qZldY2b/NrOvgJ7AmJhjFCmx4/c6HoDFaxYze8VsZi2bxf3j7485KhFJJ0ul61wz6w2cAuwPPA6cBFzl7i+kN7ygc+fOPl6DjEmW+vln+Oij8Lj7bli3DqpWhUsvhYsugp13jjtCkbLDzCa4e+cYt78n8Ctgd2AN8A0wwt3XxBVTUXR+lFQs+nkRazasYeX6lbS9vy3V8qqx8vKVapMnkmVSPUemlOBFK9yHMMC5AR+4+zc7FmLqdAKT8mLsWDjjDJg6NTyvVCn0utmzZ+h1M1fnWqng4k7wso3Oj1JSLe5qwaxls5h8wWTa1G8TdzgiUgKpniOL7WQlqpo52d33AaaURnAiFVWXLjBlCowYAVddBf/7HwwfHh633AJ9+0KfPrD77nFHKlIxmdlewECgOQnnSHf/dVwxiZSmjg06MmvZLDo+1JFc23JXcdfqu/Jxn49pVqdZjNGJSGkotg2eu28EvjWzphmIR6TcM4PDDoNPPgnt9G64AXbdFWbMCAOnN2wIF18MX38dd6QiFdILwOfAVcAlCQ+RcuHkNieTl5PH+o3rWfPLms2P75d/zzvT34k7PBEpBal2slIXmBw1Ph+e/0hnYCIVwX77hV43Z84M7fNq1QrT77gD2rYNyZ+IZNQv7v6Au4919wn5j7iDEiktp+17GqsuX8XqK1Zvfvzj8H8A8PVC3VkUKQ9SHQfv6rRGIVLBVa8OF14If/4zvPwyXHcdfPMNXHtt6Hmza9e4IxSpMF43swuAV4B1+RPdfUkmNm5mvYBjgVrAI+7+Xia2KxVLlbwqWz3fr8F+AHy14CtWrlu5zfJV86pSKbdSRmITkR1XZCcrZtbN3UdnMJ4CqRG5VDSbNkHHjvDll+H5gAFw001hyAWR8izuTlbM7LsCJru7t0zhtY8CxwEL3L1dwvQewF1ALjDU3W9JYV11gdvcvW9Ry+n8KKVh5tKZ7HH3HoXOr12lNhPPn0jzOs0zF5SIbCPVc2RxVTQ3D5RiZqN2OCoRSUlODowaFQZJB7j33lB987jj4NNP441NpDxz9xYFPIpN7iLDgB6JE6KOyu4DjgbaAKeZWRsz29fM3kh67Jrw0qui14mkXfM6zenRqgc1KtfY5pGXk8fydcsZ8f2IuMMUkRQVV0XTEv6vms5ARGRrO+0UhlV44onQ4+bcufDmm+HRpUtos6eqmyKlw8xOLGq+u79c3DrcfYSZNU+a3AWY7u4zo+08C5zg7jcTSvuS4zDgFuBtd/8stehFdkyO5fB277cLnHflB1dy0/9uYsaSGRmOSkS2V3EJXk5UTSQn4f/NSV+m2iSIVFS5uWF8vLPOgo8/httug7feColft25w6KHQq1dou1dVt2BEdkTPIuY5UGyCV4hGwI8Jz2cDRd2auRA4AqhtZq3c/cHkBczsPOA8gKZN1cG1pNceO4eqm2PmjOHjWR9vnl4lrwoHNDxAg6WLlEHFtcGbBWxi65K8fCm1SSgNamMgssXMmSHh+9//tkyrXRtuvx3OOScMwyCSreJug7ejohK8N/Lb4JnZSUAPd+8XPT8D6OruA0pjezo/SrqN+H4Ehw07rMB5N3S/gasPUz98Ip
lSKgOdu3vzUotIREpFy5ZhDL0ffoChQ+GBB2DRIujXDwYPhueeC8MviEiZMAdokvC8cTRNJCsc1OQgzu5wNjOWbqmiuWTNEiYtmMSEnzSCiEhZVGQnKwW0JUieb2bWuDQDEpHUNG0axsmbOxeuuSZM+/Zb6NQJPv883thEZLNxwJ5m1sLMKgOnAhpHVrJGXk4ej57wKB/3+Xjz49HjHwVg1rJZ8QYnIgUqrg3eYDPLAV4DJgALCZ2ttAIOB34DXEtoUyAiMahUCa6/Hvr3h4MOghkzQu+b550Xqm3WqBF3hCIVg5k9A3QH6pnZbOBad3/EzAYA7xKGSXjU3SeXwrZ6Aj1btWq1o6sSKbH84RK+XfwtJ79w8lbzeu3di97te8cQlYjkK7INHoCZtQF6A78CdgfWAN8AbwIvuvvadAepNgYiqVm2DI4/PlThhDDcwn//GzpjEckGZaENnpnt4+5T8v/GGUtxdH6UOLg7jf7ViJ9W/bTNvLpV67LkMvXBJ5IOpdIGD8DdvwauLJWoRCSt6tSBESPgH/+Aq68OA6Yfdliotnn33aGET0SK9TSwf8JfEUlgZow8ZyTj5259c+G0l05j6dqlbNi4gUq5lWKKTkSKTfDymdlBQPPE17j7E2mISUR20FVXwe9/DwMGwIcfwoQJ8KtfhSEXHnlEPW2KpKjM/lJURVPi1qJuC1rUbbHVtP5v9WfhzwtZvGYxDWo0iCkyESmyk5V8ZvZv4DbgYOCA6JG13ViLVAStW8MHH8C0aXDwwWHaY49Bgwbw+ONQTO1sESnD3P11dz+vdu3acYcistkuO+0CwKKfF8UciUjFlmoJXmegjRfXYE9EypxWrUKbvJNOgpdeggULoE+f8Lj44tATZ/XqcUcpIiLZrt5O9QA4+7WzqVO1DgB77rwn9xx9jwZEF8mglErwgEmAytpFstiLL8Ibb0CHDlum/etfoZfNAw6ApUvji02kjNJNTZESaF2vNQDj547n/Znv8/7M93lg/AN89tNnMUcmUrGkmuDVA742s3fNbHj+I52BiUjpO/bYMEbenDmht81848fDzjvDgw/GF5tIGWJJf8scM+tpZg8vX7487lBENrv76Lv58MwPee+P7/HeH9+ja6OuAMxfPT/myEQqllQTvOuAXsBNwO0JDxHJQg0bwmuvhV42r7tuy/Q//zkkehMnxhaaSFlwSNLfMkdt8KQsqppXlcNbHM6RexzJkXscyT719gFgweoFMUcmUrGklOC5+8cFPdIdnIiklxlcey3MmrVl2tKl0LEjXHIJrFoVW2gisXH3VYl/RWT77Fp9VwAGvDWAXW7dhV1u3YV+w/vFHJVI+VdkgmdmK81sRQGPlWa2IlNBikh6NWsWSvOeSBj45LbboGbNUKonIiJSUoc1O4y8nDzW/LKGJWuWsGTNEh6b+BgbN22MOzSRcq3IBM/da7p7rQIeNd29VqaCFJH0M4MzzoC1a2GPPbZMf/DBMG/YsNhCExGRLHTsXsey9LKlLLpkEYsuWcQu1XZhk29i4c8L4w5NpFxLeaBzEakYqlSB6dNh3Tpo0QJ++ilMP/tsOP98WLYMqlaNN0aRdDGzTu4+IWnace7+RlwxFUQDnUu2qFG5BjUq1wBg95q7s3jNYk56/iSqV94yPk+lnEoMOngQBzc9OK4wRcqV2BI8M8sFxgNz3P24uOIQkYJVqQJz58LYsdA1dITGunVQrRr87W+hCmdOqt00iWSPIWZ2prtPAjCz04C/AmUqwXP314HXO3fufG7csYikqt2u7Zi0YBIjfxy5zbwqeVWU4ImUkjhL8C4CvgFU1VOkDOvSBdzhppvgyivDtDvuCIOmT5sGlSvHG59IKTsJeNHMTif0onkmcFS8IYmUD0N7DqVvx75btcH7fN7nXP7B5SxbuyzGyETKl1gSPDNrDBwL3AhcHEcMIlIyV1wB554LBx4IM2bADz+EUr4JE2D//eOOTqR0uPtMMzsVeBX4ATjK3d
fEHJZIuVC9cnWOaHnEVtNqVw1DfaxctzKOkETKpbgqWN0JXApsimn7IrId6tcPpXZ9+26Z1lH/0TgAAB8CSURBVKkTDB4cSvlEspWZfWVmX5rZl8CLwM5AC2BMNE1E0qBm5ZoArFyvBE+ktGS8BM/MjgMWuPsEM+texHLnAecBNG3aNEPRiUhxzGDoULjggpDcAVx6Kfzzn2E8vRo1Yg1PZHtlVVtwdbIi5UXNKiHBm75kOnvfu/c28yvnVmbwkYPp0apHpkMTyVpxlOD9CjjezGYBzwK/NrMnkxdy94fdvbO7d65fv36mYxSRYuy/f+iEpVq18Hzx4jBu3rhx8cYlsp3mA78DLgF6EDoA+z7/EW9o23L31939vNq1a8cdisgO2a36bjSs2ZBfNv3C1MVTt3lMWjCJYROHxR2mSFbJeAmeu18OXA4QleANdPc/ZjoOEdlxu+8Oq1fDgAFw//1hWpcu8Je/wJ13htI+kSzxOLAB+AQ4GmhD6AxMRNKoSl4Vpg6YyuwVs7eZN/LHkfQd3pd5/9/evYdZVdeLH39/ZhhgAEEMr1xEUfGCoTnlrU5iXshEpDTt9zuZ2qOVWZKmmU+Zdn6lhaef9ejx95DXk6bmFTmZl+MFtSQUsaOChqIIiIIXRLkPfH9/rD0wM8zoDMzsNXvv9+t51jN7fdfae31Y6Hz47O9lffgmi5Ytavq+6h7r5+9Jasrn4EnaLBFw5ZVw5JEwdmzW9rvfZcM4//lPGDgw3/ikNtozpbQ3QERcA0zLOR6pYvTu3pvhAzYenrl01VIApsydwraXbdvkWFVUcdOXb+LEEScWJUaplOT6FKuU0qM+A08qD8ccA8uXb9hfvhwGDYKJE/OLSWqHNQ0vUkr1eQYiKbP3tnvz2SGfZeteWzfZetf0Zl1ax9T5U/MOUeqSfEyxpA5TWwv19XDppRvavvUtGDwYFizILy6pDUZGxNLC9gHwyYbXEbE07+CkStSzW08eP+VxFp27qMk24fAJAKxY4xNMpJZY4EnqUNXV8KMfwfxG0ynmz8968046qWkvn9RVpJSqU0p9C9sWKaVujV73zTs+SRv0qukFwIp6CzypJc7Bk9QpBg6Edevgoovg5z/P2v7wh2x75RXYeedcw5NKmo9JUCWrrcmWb565eCYTpzedBzBwi4EctetRhKt8qYJFKoGnE9fV1aWnn3467zAkbaIPP4TRo+Gvf93QNn48/OY3rrSppiJiekqpLu84SoX5UZXogVce4Mgbj2z1+NOnPc1+O+xXxIik4mhrjrQHT1Kn69MHnngCrrsOTj01a7v8cnj6aXjsMYs8SVLbjRo6igv/5UIWfriwSft9L9/HvKXzeGvZWzlFJnUNFniSiuaUU2DcuOz5eStXZkVfVZVDNiVJbVdTXcPFoy7eqP3YW45l3tJ5rF67OoeopK7DRVYkFdWWW8L778NBB21oGzYMrroqv5gkSaWvR7ceAKyqX5VzJFK+LPAkFV337tl8vOuv39B2xhmw//6weHFuYUmSSlj36u4ArFprgafK5hBNSbn5xjfggANg992z/WnTYJtt4K674Nhj841NklRaelRnPXg/fOCH/HzKz5sc69ujLzccewN7b7t3HqFJRWUPnqRcDR+erbL5gx9saBs3Livw6uvzi0uSVFr22W4fABYvX8wr773SZJvx5gzuevGunCOUisPHJEjqMv7xD9hnnw37e+4J990HgwfnF5OKy8cktI/5UWpq/tL5rKxf2aTt6meu5ld//RWD+g5i6JZDAaiKKs7a/yy+vMeXc4hS2jQ+JkFSyRk5Mltdc4cd4N13YeZMGDIEJk+Go4/OOzqp6/BB51LLBvUdtFHbwYMPBrLib/7S+evbV69dbYGnsuQQTUldSo8esGABfPObG9rGjIEf/QhWOW9eAiClNDmldHq/fv3yDkXq8o7e7Wie+85zPHbyYzx28mNcP/Z6AD5Y9UG+gUmdxB48SV1Oz55w9dUwejQcf3
zW9utfw223waRJsLdz5CVJbRQRjNhmxPr9HbbYAYBla5blFZLUqezBk9RlHXccvPgiNHRSvPoqfPKT8MAD+cYlSSpdfbr3AbIevLeXv71+e2f5O5TC2hTSx7HAk9SlDR8Oc+fC+edvaDvyyOzB6EuX5heXJKk09e7eG4B3VrzD1hO2Xr8NmDCAUyadknN00uazwJPU5fXrB5dcArffvqHtjDPgkEPgjTdyC0uSVIJ61/Tmq3t9lU/UfmL91r9nfwAeee2RnKOTNp9z8CSVjK98BR5+OJuX9847MGMGDBsGixdDnz55RydJKgURwa3H3dqkbcnKJfT/VX8WL1vMb6f+dqP39O3RlxNGnECvml7FClPaZBZ4kkrKqFEwZw6MHw/XXZc9VmGrrbK2QRuvji1J0sfq26Mvtd1qWVG/gvH3j2/xnBX1Kzjj02cUOTKp/SzwJJWcvn3h2mthwACYMAHWrMkehv7LX8KPf5x3dJKkUlMVVdz8lZt5+NWHNzo2dcFUpi2YxuJli3OITGo/CzxJJevXv8567xqKugsugPfey+bnDR2aa2iSpBIzdvexjN197Ebtv3jsF0xbMI3Va1fnEJXUfi6yIqmknX9+NjyzwYQJWcH32mu5hSRJKiPdq7sDWOCpZFjgSSp5O+0Ef/0rnHZatn/LLVnbpEn5xiVJKn09uvUALPBUOhyiKaksHHQQ7LUXvP46PPssvPUWnHtu9lD03/0OqqvzjlDqOBExBhizyy675B2KVPYaevBmvT2L2164rUn74cMOd2VNdTkWeJLKRr9+cN99cOutcOKJMHt2to0aBUcdBb3MwSoTKaXJwOS6urrT8o5FKne9a7IHoz8450EenPNgk2PnHHgOlx1xWR5hSa2ywJNUdo4/HoYMyR6lMG3ahv2XX4aamryjkySVkjHDx/Dt/b7N2yveXt82f+l8ps6fyuvvv55jZFLLLPAklZ2qKjjwwGwBlp/8BF56KRu6eeGF8LnPZb15kiS1xZY9t+Sqo69q0jbpxUkce+uxrKxfmVNUUuss8CSVrXHjsq2uDqZPh0svzVbZXLwY+vfPOzpJUqmqrakF4PHXH+fz13++ybFxu49j/AEtPyxdKgYLPEllb+JEuPNOuOoqePddOOEE2Hln+Pd/h969845OklRqhvUfRhAsWbmEx+Y+1uTYMwufscBTrizwJJW9T30q26ZPzxZhebAwR/7II7MePkmS2mPYVsOY/b3ZzF86f31bIjHqhlEsW72MlBIRkWOEqmQWeJIqxk03weOPwxVXwH//N1xwQfb6rLPgmGPyjk6SVEqGbTWMYVsNa9LWo7oHq9auYtXaVfTs1jOnyFTpLPAkVYyttoKxY+Gf/8wKvBdfzLaVKy3wJEmbr1dNL1atXcW4W8dRU9Xyss1VUcW3677N6F1GFzk6VQoLPEkV5+yzs9U0Z82CU0+FmTPh61+HT3wCLroIttwy7wglSaVo6JZDee/N97jv5fs+8rzFyxdb4KnTWOBJqjjV1XDAAbDbbtlz8ZYsgRtvzI7tsw+cfHKu4UmSStR9/3ofU+dPbfX47Hdm88MHf8iq+lVFjEqVxgJPUsXaaiv429+ynrybboL774cpU6BPH9huOzj4YHCOvCSprbbpvQ3HDG99zP8zC58BYG1aW6yQVIGq8g5AkvJUV5cNzzzggGz/+uvh+OOzIZxPPplraJKkMlMd1QCsXWeBp85jD54kAaecAnPnwgcfwLRpMG8evPYaHHRQ3pFJkspFdVWhwLMHT53IHjxJAnbcEa67Dm6/HY44Imv76U/hwANh1CiY2vqUCkmS2sQePBWDPXiS1Mzuu2c/58zJNoBrrtkwjFOSpE1hD56Koeg9eBExOCIeiYiZEfFCRJxV7Bgk6aOccw7MmJEtwHLxxVnbhx/mG5MkqfTZg6diyKMHrx44J6X0TERsAUyPiAdTSjNziEWSNhKRPS4BYNGi7Ocdd8CAAdnrIUPg0Uehb99cwlMZi4
g9gLOAAcBDKaWrcg5JUgeyB0/FUPQCL6W0EFhYeP1BRMwCBgIWeJK6nJEjYYstssVX3nkna3vnHZg+PZubJzWIiGuBo4FFKaURjdpHA78FqoGrU0qXtvYZKaVZwLcjogr4T8ACTyojDT14S1Yu4dInWv1VQE1VDSeOOJGBfQcWKzSVkVzn4EXEUGBf4O95xiFJrRk6NOvFaxiieeKJ8NBDsGJFrmGpa7oeuIKsMAMgIqqBK4HDgfnAUxFxD1mxd0mz95+aUloUEccA3wH+UIygJRVPn+59CIKlq5by44d+/JHnPr/4ea4be12RIlM5ya3Ai4g+wB3A+JTS0haOnw6cDjBkyJAiRydJG/TsmW2wYVjmRRfBxInQrRucfbaPUxCklB4rfHHZ2GeAl1NKcwAi4hZgbErpErLevpY+5x7gnoj4M/DHzotYUrH1r+3PzV+5mRlvzmj1nNnvzubOWXfy7op3ixiZykkuBV5E1JAVdzellO5s6ZyU0kRgIkBdXV0qYniS1KqhQ7OfTz21oW3tWrjrrlzCUdc3EJjXaH8+sH9rJ0fEIcCXgR7Ava2c4xegUgk7YcQJnDDihFaP3zv7Xu6cdSdr1q4pYlQqJ0Uv8CIigGuAWSml3xT7+pK0OX7xCzjsMFi9Gp59Nltlc9myvKNSuUgpPQo8+jHn+AWoVMa6VWX/PF+zzgJPmyaPB50fDHwdODQini1sR+UQhyS1W20tHHUUHHssHHpo1rZyZb4xqUtbAAxutD+o0CZJLaqpqgGgfl19zpGoVOWxiuYTQBT7upLU0Rrm5T3xBPTuvaH9iCPgzjuzxy2o4j0F7BoRO5EVdicC/yvfkCR1Zet78ByiqU2U6yqaklTKhg+HgQNhwQJYvnxD+913Z0M4e/TILzYVX0TcDBwCDIiI+cDPUkrXRMSZwP1kK2dem1J6oQOuNQYYs8suu2zuR0nqYmqqsx68Fxa/wLG3HNvqedv03oYJh0+gX89+xQpNJcICT5I2Ub9+MHdu0yGa222XPVJh1SoLvEqTUvpaK+330sqCKZtxrcnA5Lq6utM68nMl5W+HLXYgCJasXMKklyZ95LmH7XwYX93rq0WKTKXCAk+SNkN1ddPhmT16ZAXe6tX5xSRJKl1D+g1hxrdm8OqSV1s95/KplzNl7hRW1jsJXBuzwJOkDtS9e/bzwguhT58N7dttB9/7HtTU5BOXyotDNKXyNnK7kYzcbmSrx+9+8W6mzJ3C2nVrixiVSoUFniR1oAEDYOFCuOqqjY/ttRcceWTxY1L5cYimVNkaFmJxpU21xAJPkjrQH/8I9zabbXXjjfDcc/D++/nEJEkqL9VRDcDaZA+eNmaBJ0kdaMSIbGvsH//ICjzn5UmSOoI9ePooeTzoXJIqSsO8uzU+0kgdJCLGRMTE9+0WlipSdVWhB885eGqBPXiS1MkaFl554w2YM2fj4zvumK3GKbWVc/CkymYPnj6KBZ4kdbKGAu8nP8m25g49FB56qLgxSZJKV8McvAsevoCLp1zc5veN2GYEU06esv5h6ipPFniS1MmOOw4eeQRWrGjavnYtvP56NkdPkqS2OmjwQXT7ezdWr13N6rVtn+D95PwnmfPeHIYPGN6J0SlvFniS1MkOOQReeGHj9iVLoH9/qHeEjSSpHcbtMY6l5y9lzbq2T+6um1jH7Hdnsy6t68TI1BVY4ElSTroVfgO7+IraywedS6qtqaWW2jaf3zAs0wKv/LmKpiTlpKHAswdP7ZVSmpxSOr1fv355hyKpRFRF9s9+n51X/izwJCknDY9PsMCTJHW2hgLPHrzy5xBNScpJVeErtnXrsjl6ES2fV1sLO+1UvLgkSeWnYeVNC7zyZ4EnSTmJyHrx1qyBESM++twrroDvfrc4cUmSyo89eJXDAk+ScnTOOTBpUuvH330X3noLZs4sXkzq+lxkRVJ7rZ+Dt845eOXOOXiSlKNLLsmKt9a2Cy/MzltrPlYjLrIiqb3swascFn
iS1IVVZ1MmWGc+liRthuoq5+BVCgs8SerCGgo8e/AkSZvDxyRUDufgSVIXZoEnSeoIDQXeLx//Jdc9e91Gx7tXdWf8AeOZtmAaj859dKPjx+x2DOP2GNfZYaoDWOBJUhdmgSdJ6gjb9t4WgPtfub/Vc1avW82N/3Mj9es2fkDrX2b/xQKvRFjgSVIXZoEnSeoIVx51JUfvdnSLq2j+bd7fuHrG1SxbvYz6dfVURzW/H/N7AJavWc6ZfzmTlfUrix2yNpEFniR1YRZ4aomPSZDUXlv33pqTRp7U4rHu1d25esbV6+fndavqxin7ngLA+yvf58y/nOniLCXEAk+SurCGAu+11+CWW9r+vm23hUMOyR6mrvKTUpoMTK6rqzst71gklb4oJIuG3r1olDwaXidS8QPTJrHAk6QurLY2+zltGnzta+177+OPw2c/2/ExSZLKS/MVNhv2G79OyQKvVFjgSVIXduihcNZZ8OabbX/PE0/AggXte48kqXIFzXrwiI2OOUSzdFjgSVIX1qsXXH55+95z/PFw++0+HF2S1Dbrh2i20IPnEM3S44POJanMVBV+s1vgSZLaYv0QzRbm4DlEs/RY4ElSmWnIy+ZiSVJbrB+i2VIPnkM0S44FniSVGXvwJEnt0VDQNRRxjefgre/Bc4hmybDAk6QyY4EnSWqPNj0mwWEhJcMCT5LKjEM0y19EjImIie+//37eoUgqAw7RLC8WeJJUZuzBK38ppckppdP79euXdyiSysBGi6w4RLOkWeBJUpmxwJMktUdbHpMADtMsFRZ4klRmHKIpSWqPj3pMAmzo0bMXrzRY4ElSmbEHT5LUHs3n2TXuwQMXWik1FniSVGYs8CRJ7dF8iGbjOXiN911opTRY4ElSmbHAkyS1R/Mhms178FxopbTkUuBFxOiIeCkiXo6I8/OIQZLKlXPwJEnt0byHbqM5eA7RLClFL/Aiohq4EvgisCfwtYjYs9hxSFK5sgdPktQeDtEsL91yuOZngJdTSnMAIuIWYCwwM4dYJKnsNBR4d9wBc+YU97qXXVa860mSOkbDEMzFyxY32W9+/LwHz6Omuqa4wZWJQ3c6lKN3O7oo18qjwBsIzGu0Px/Yv/lJEXE6cDrAkCFDihOZJJWB/v2zn48+mm3FUl1tgSdJpah/zyxxfLD6g2y/tn/T47X9WfHBCq546oqix1Yuuld3L+sCr01SShOBiQB1dXUO+JWkNjr7bNh+e1ixorjXbTZlQ5JUIvbZbh/+dNyfmLd0HkEwepfRTY7fc+I9TJk7JafoysOnd/h00a6VR4G3ABjcaH9QoU2S1AG23BK+8528o1BniogxwJhddtkl71AklYGI4Pi9jm/1+H477Md+O+xXxIi0OfJYRfMpYNeI2CkiugMnAvfkEIckSSUppTQ5pXR6v3798g5FktTFFL0HL6VUHxFnAvcD1cC1KaUXih2HJEmSJJWbXObgpZTuBe7N49qSJEmSVK5yedC5JEmSJKnjWeBJkiRJUpmwwJMkSZKkMmGBJ0mSJEllwgJPkiRJksqEBZ4kSZIklQkLPEmSJEkqE5FSyjuGjxURi4G5m/kxA4C3OyCcUud9yHgfMt6HjPch0xXuw44ppa1zjqFkdFB+hK7xd98VeB8y3oeM98F70KCr3Ic25ciSKPA6QkQ8nVKqyzuOvHkfMt6HjPch433IeB8ql3/3Ge9DxvuQ8T54DxqU2n1wiKYkSZIklQkLPEmSJEkqE5VU4E3MO4AuwvuQ8T5kvA8Z70PG+1C5/LvPeB8y3oeM98F70KCk7kPFzMGTJEmSpHJXST14kiRJklTWKqLAi4jREfFSRLwcEefnHU8eImJwRDwSETMj4oWIOCvvmPISEdURMSMi/ivvWPIUEVtGxO0R8WJEzIqIA/OOqdgi4geF/x+ej4ibI6Jn3jEVS0RcGxGLIuL5Rm1bRcSDETG78LN/njGq85kfzY/NmSPNjw0qNUeWQ34s+wIvIqqBK4
EvAnsCX4uIPfONKhf1wDkppT2BA4DvVuh9ADgLmJV3EF3Ab4H7Ukq7AyOpsHsSEQOB7wN1KaURQDVwYr5RFdX1wOhmbecDD6WUdgUeKuyrTJkf1zM/NmWOrPD8CBWfI6+nxPNj2Rd4wGeAl1NKc1JKq4FbgLE5x1R0KaWFKaVnCq8/IPtlNTDfqIovIgYBXwKuzjuWPEVEP+BfgGsAUkqrU0pL8o0qF92A2ojoBvQC3sg5nqJJKT0GvNuseSxwQ+H1DcCxRQ1KxWZ+xPzYmDnS/NhMRebIcsiPlVDgDQTmNdqfT4X+4m4QEUOBfYG/5xtJLi4HzgPW5R1IznYCFgPXFYbiXB0RvfMOqphSSguAy4DXgYXA+ymlB/KNKnfbppQWFl6/CWybZzDqdObHZio8P4I5EsyPgDmyBSWVHyuhwFMjEdEHuAMYn1Jamnc8xRQRRwOLUkrT846lC+gGfAq4KqW0L7CMLj7coKMVxs+PJUvmOwC9I+Jf842q60jZEssus6yKUcn5EcyRjVR8fgRz5EcphfxYCQXeAmBwo/1BhbaKExE1ZMnrppTSnXnHk4ODgWMi4jWyoUiHRsSN+YaUm/nA/JRSw7fUt5MltEpyGPBqSmlxSmkNcCdwUM4x5e2tiNgeoPBzUc7xqHOZHwvMj4A5soH5MWOObKqk8mMlFHhPAbtGxE4R0Z1sgug9OcdUdBERZOPJZ6WUfpN3PHlIKf04pTQopTSU7L+Dh1NKFfltVErpTWBeRAwvNH0BmJljSHl4HTggInoV/v/4AhU4kb6Ze4BvFF5/A5iUYyzqfOZHzI8NzJEZ8+N65simSio/dss7gM6WUqqPiDOB+8lWALo2pfRCzmHl4WDg68BzEfFsoe2ClNK9OcakfH0PuKnwD7s5wCk5x1NUKaW/R8TtwDNkq+jNACbmG1XxRMTNwCHAgIiYD/wMuBT4U0R8E5gLfDW/CNXZzI/rmR/VXEXnR6jsHFkO+TGyYaSSJEmSpFJXCUM0JUmSJKkiWOBJkiRJUpmwwJMkSZKkMmGBJ0mSJEllwgJPkiRJksqEBZ4kSZIklQkLPKmDRMSHbTzv9ojYufD6tYh4LiL+JyKmRMSOm3H96yPiuBbab4mIXTf1cyVJAoiIwRHxakRsVdjvX9gfGhGHRMR/dfD1Ho2IujacNz4iTiq8vr4Q07MR8Y+I+MJmXP/kiLiihfYzI+LUTf1cqbNZ4ElFFBF7AdUppTmNmkellD4JPAr8pBMuexVwXid8riSpgqSU5pHllEsLTZcCE1NKr+UVU0R0A04F/tio+dyU0j7AeOD/dcJlryV7GLrUJVngSR2s8C3mo4Weuhcj4qaIiMLh/w1MauWtTwIDG33O3RExPSJeiIjTG7V/GBG/KHwzOTUitm0hhn8rfItZDTwOHFZIgpIkbY7/CxwQEeOBzwKXNT8hIj4dETMiYlhHXfQjct+hwDMppfoW3tY8r14YEU9FxPMRMbEhNxdy9q8iYlpE/DMiPtfC9b8UEU9GxICU0nLgtYj4TEf9+aSOZIEndY59yb453BPYGTi40H4wML2V94wG7m60f2pKaT+gDvh+RHyi0N4bmJpSGgk8BpzW+EMiYgKwNXBKSmltSmkd8DIwcrP/VJKkipZSWgOcS1bojS/srxcRB5H1mo1NKb3S7NjwwtDJlrYtP+bSreW+9uTVK1JKn04pjQBqgaMbHeuWUvoMWe7+WbO4xwHnA0ellN4uND8NbFQISl2B3+hLnWNaSmk+QEQ8CwwFngC2BxY3O/eRwnyGD4GfNmr/fiGpAAwGdgXeAVYDDfMcpgOHN3rPT4G/p5ROp6lFwA60ngQlSWqrLwILgRHAg43a9wAmAkeklN5o/qaU0kvAPpt4zdZy3/bArGbnToiIXwKDgAMbtY+KiPOAXsBWwAvA5MKxOxt99tBG7zmU7IvWI1JKSxu1LwJ238Q/i9Sp7MGTOseqRq/XsuHLlBVAz2bnjgJ2BJ4FLoZsmC
dwGHBg4dvKGY3etyallFr4bICngP0aJsA30rNwbUmSNllE7ENWXB0A/CAitm90eCGwkmwUS0vv3ZwevNZyX0t59dyU0m7Aj8jmyxERPYH/AI5LKe0N/L7Z+xrydvO8+gqwBbBbs2uYV9VlWeBJxTUL2KV5Y2HuwHjgpEJx1g94L6W0PCJ2J0ukbXEf2aT3P0fEFo3adwOe36zIJUkVrTBn7SqyoZmvAxNoOgdvCfAl4JLCF5VNpJReSint08q2ZBPDajGvFlwBVEXEkWwo5t6OiD7ARqtOt2Iu8BXgPwsLpTUwr6rLssCTiuvPwCEtHUgpLQRuBr5LVqh1i4hZZAXb1LZeIKV0G9k3k/dERG1hIvqKlNKbmxm7JKmynQa8nlJqGJb5H8AeEfH5hhNSSm+RzW27MiL2L0JMfwH+paUDhR6//wOcVyggf09WlN1PNuKlTVJKL5ItknZbo4VjDqbp8FSpy4gNvd2SOltE1AKPAAenlNYW6Zo/AJamlK4pxvUkSSqmiLiLrIibXaTr7QucnVL6ejGuJ7WXPXhSEaWUVpCtzjXw487tQEuAG4p4PUmSiul8ssVWimUATRdFk7oUe/AkSZIkqUzYgydJkiRJZcICT5IkSZLKhAWeJEmSJJUJCzxJkiRJKhMWeJIkSZJUJv4/e1s+Le7gikAAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 1080x360 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.rcParams['figure.figsize']=[15, 5]\n",
"###########################################\n",
"plt.subplot(1, 2, 1)\n",
"plt.plot(lnrank, lnfreq, lw=2, color='b')\n",
"plt.title('ln (Rank) vs ln (Frecuencia) de palabras en español')\n",
"plt.xlabel('ln(Rank)')\n",
"plt.ylabel('ln(Freq)')\n",
"##############################################\n",
"bins = 80\n",
"histEsp = np.histogram(lnrank, bins)\n",
"hist_distEsp = stats.rv_histogram(histEsp)\n",
"plt.rcParams['figure.figsize']=[15, 5]\n",
"plt.subplot(1, 2, 2)\n",
"plt.title('Funcion de Densidad de Probabilidad (PDF) Xk vs PK')\n",
"plt.xlabel('Xk = ln(Rank)')\n",
"plt.ylabel('Pk = ln(Freq)')\n",
"#plt.xscale('log')\n",
"plt.yscale('log')\n",
"plt.plot(lnrank, hist_distEsp.pdf(lnfreq), lw=2, color='g', label='PDF_Español')\n",
"\n",
"\n",
"print('''Se puede observar que ambas graficas son muy parecidas, la grafica de la función de densidad de probabilidad no \n",
"tiene un comportamiento 'normal', esto debido a que la frecuencia va decreciendo en una proporcion inversa a \n",
"su lugar en el ranking, por ejemplo la frecuencia de palabra 'la' es 1/2 de la primera palabra 'de', \n",
"como se puede observar en la tabla de rank vs freq, y por ende las probabilidades van decrementando igual \n",
"forma, hasta llegar a practiamente cero en las ultimas palabras del ranking\n",
"\n",
"De forma experimental y probando varias veces, se hizo una variación del coeficiente 1.78*R de tal forma que\n",
"arrojara una suma de probabilidades cercana uno (1). Se encontró que el coeficiente original tambien funciona \n",
"para el idioma español, sin embargo si se quisisran tomar más decimales, se encontro una aproximación \n",
"de 1.78104 con el que la suma de p(k) dio un poco más cercana a 1.''')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0000003867761278"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"R=len(frec_esp)\n",
"pk_ingles=[]\n",
"pk_español=[]\n",
"\n",
"for r in range(1,R):\n",
" pk_español.append(1/ (r*np.log(1.78104*R)) )\n",
"\n",
"sum(pk_español)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
......
...@@ -4,8 +4,124 @@ ...@@ -4,8 +4,124 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# 3. Herramientas\n", "# 9. Procesamiento de Lenguaje Natural\n",
"## 3.1 Django (Instalacion, apps y vistas)" "\n",
"\n",
"## Contenido de la Presentación\n",
"\n",
"https://docs.google.com/presentation/d/1TYSbrhAfTvW8uuK1_nZiuoqgyOCkHxXrXCz9bVVP39Y/edit?usp=sharing\n",
"\n",
"\n",
"#### 9.1 Presentación de la Línea de investigación:\n",
" - Procesamiento de Lenguaje Natural\n",
" - Minería de Textos y Reconocimiento de patrones\n",
"\n",
"\n",
"#### 9.2 Presentación de dos estudios de caso:\n",
" - Recuperación, procesamiento y clasificación de tuits\n",
" - Reconocimiento de Entidades Nombradas Georeferenciables\n",
" \n",
"\n",
"#### 9.3 Instrumentos metodológicos:\n",
" - Datos y corpus lingüísticos como Instrumentos metodológicos de la Minería de Textos\n",
" - Técnicas de recolección de datos\n",
" - Repositorios\n",
"     - Crawling\n",
" - Crowdsourcing\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejemplos "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Repositorios\n",
"\n",
"### 20 Newsgroups\n",
"\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20newsgroups.data.html\n",
"\n",
"https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crowdsourcing\n",
"\n",
"### Entidades Nombradas Georeferenciables\n",
"\n",
"http://ner.geoint.mx/\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ejercicios"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Repositorios\n",
"\n",
"1. Generar una estructura de archivos y directorios similar a 20 Newsgroups pero con documentos en español y al menos cinco categorías\n",
"\n",
"2. Elegir y compilar al menos 100 documentos de cada categoría que serán utilizados en el proyecto final\n",
"\n",
"3. Subir el dataset generado al repositorio\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crowdsourcing\n",
"\n",
"1. Ingrese a la plataforma de anotación de misoginia (Crowdsourcing)\n",
"\n",
"2. Haga el tutorial de la plataforma\n",
"\n",
"3. Realice 100 anotaciones de tuits siguiendo las instrucciones\n",
"\n",
"http://etiquetamisoginia.geoint.mx/\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crawling\n",
"\n",
"REMERI es la Red Mexicana de Repositorios Institucionales\n",
"\n",
"El objetivo general de REMERI es integrar una red federada de Repositorios de acceso abierto de las Instituciones Mexicanas de Educación Superior (IES), con la finalidad de integrar, difundir, preservar y dar visibilidad a la producción científica, académica y documental del país.\n",
"\n",
"1. Ingrese a la plataforma REMERI y realice una búsqueda por palabra, por ejemplo: \"nopal\"\n",
"\n",
"http://www.remeri.org.mx/portal/REMERI.jsp?busca=nopal\n",
"\n",
"\n",
"2. Defina la Clase CrawlerRemeri() la cual tendrá un método search(query, n=5) que realiza la búsqueda de la cadena query en REMERI y descarga n documentos resultantes de la búsqueda.\n",
"\n",
"3. Modifique el método search(query, n=5) para que, cuando n sea negativo, descargue todos los documentos resultantes de la búsqueda en REMERI\n",
"\n",
"\n",
"\n",
"#### Observaciones\n",
"\n",
"* Utilice la biblioteca de Python Requests para realizar las peticiones\n",
"* Sea cuidadoso ya que el sitio podría banear su IP en caso de que detecte un ataque"
] ]
}, },
{ {
...@@ -19,7 +135,1222 @@ ...@@ -19,7 +135,1222 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## 3.2 Django (Templates y HTML5)" "# 9. Procesamiento de Lenguaje Natural parte 2\n",
"\n",
"\n",
"## Contenido de la Presentación\n",
"\n",
"\n",
"https://docs.google.com/presentation/d/1WCVA9bMu12rfQDSg5guPXg6FkLgKmRsUM9aptMJ4Z1s/edit?usp=sharing\n",
"\n",
"\n",
" \n",
"#### 9.4 Codificación textual:\n",
"\n",
" - Bolsa de palabras\n",
" - Modelo Vectorial\n",
"\n",
"\n",
"#### 9.5 Similitud Textual con producto coseno\n",
"\n",
"\n",
"#### 9.6 Técnicas de Análisis de Textos con Aprendizaje Automático en estudios de caso:\n",
"\n",
"\n",
" - Clasificación\n",
" - Agrupamiento (Clustering)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modelo vectorial en sklearn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
" tokenizer=None, vocabulary=None)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer = CountVectorizer()\n",
"vectorizer "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<4x9 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 18 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ejemplo de un modelo vectorial minimalista\n",
"\n",
"corpus = [\n",
" 'Este es el primer documento.',\n",
" 'Este es el segundo documento.',\n",
" 'Y el tercero.',\n",
" 'Acaso este es el primer elemento?',\n",
"]\n",
"X = vectorizer.fit_transform(corpus)\n",
"X \n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 1)\t1\n",
" (0, 6)\t1\n",
" (0, 2)\t1\n",
" (0, 4)\t1\n",
" (0, 5)\t1\n",
" (1, 7)\t1\n",
" (1, 1)\t1\n",
" (1, 2)\t1\n",
" (1, 4)\t1\n",
" (1, 5)\t1\n",
" (2, 8)\t1\n",
" (2, 2)\t1\n",
" (3, 3)\t1\n",
" (3, 0)\t1\n",
" (3, 6)\t1\n",
" (3, 2)\t1\n",
" (3, 4)\t1\n",
" (3, 5)\t1\n"
]
}
],
"source": [
"print(X)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['este', 'es', 'un', 'documento', 'analizar']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"analyze = vectorizer.build_analyzer()\n",
"analyze(\"Este es un documento a analizar.\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['acaso',\n",
" 'documento',\n",
" 'el',\n",
" 'elemento',\n",
" 'es',\n",
" 'este',\n",
" 'primer',\n",
" 'segundo',\n",
" 'tercero']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.vocabulary_.get('documento')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['los',\n",
" 'ngramas',\n",
" 'son',\n",
" 'muy',\n",
" 'chidos',\n",
" 'los ngramas',\n",
" 'ngramas son',\n",
" 'son muy',\n",
" 'muy chidos',\n",
" 'los ngramas son',\n",
" 'ngramas son muy',\n",
" 'son muy chidos']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=4)\n",
"analyze = bigram_vectorizer.build_analyzer()\n",
"analyze('Los ngramas son muy chidos')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modelo vectorial tf-idf "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,\n",
" use_idf=True)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"transformer = TfidfTransformer(smooth_idf=False)\n",
"transformer \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Ejemplo usando conteos de palabras. La primera palabra está presente en el 100% de los documentos y, por lo tanto, la consideramos no muy importante."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.81940995, 0. , 0.57320793],\n",
" [1. , 0. , 0. ],\n",
" [1. , 0. , 0. ],\n",
" [1. , 0. , 0. ],\n",
" [0.47330339, 0.88089948, 0. ],\n",
" [0.58149261, 0. , 0.81355169]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts = [[3, 0, 1],\n",
" [2, 0, 0],\n",
" [3, 0, 0],\n",
" [4, 0, 0],\n",
" [3, 2, 0],\n",
" [3, 0, 2]]\n",
"\n",
"tfidf = transformer.fit_transform(counts)\n",
"tfidf \n",
"\n",
"\n",
"\n",
"tfidf.toarray() \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Como tf-idf es muy común para representar documentos, existe la clase \n",
"# TfidfVectorizer que tiene CountVectorizer y TfidfTransformer \n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer()\n",
"Xprima = vectorizer.fit_transform(corpus)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 5)\t0.4181266243877562\n",
" (0, 4)\t0.4181266243877562\n",
" (0, 2)\t0.3418459132932508\n",
" (0, 6)\t0.5164695651831305\n",
" (0, 1)\t0.5164695651831305\n",
" (1, 5)\t0.3878225151467608\n",
" (1, 4)\t0.3878225151467608\n",
" (1, 2)\t0.3170703183040649\n",
" (1, 1)\t0.4790379614294201\n",
" (1, 7)\t0.6075989123184679\n",
" (2, 2)\t0.46263733109032296\n",
" (2, 8)\t0.8865476297873808\n",
" (3, 5)\t0.3314387711719163\n",
" (3, 4)\t0.3314387711719163\n",
" (3, 2)\t0.2709729130450805\n",
" (3, 6)\t0.4093928203750212\n",
" (3, 0)\t0.519262881857229\n",
" (3, 3)\t0.519262881857229\n"
]
}
],
"source": [
"print(Xprima)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ejemplo completo Clasificación de 20 newsgroups dataset"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 20news dataset. This may take a few minutes.\n",
"2019-04-01 20:56:53,543 INFO Downloading 20news dataset. This may take a few minutes.\n",
"Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n",
"2019-04-01 20:56:53,548 INFO Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n",
"Usage: ipykernel_launcher.py [options]\n",
"\n",
"Options:\n",
" -h, --help show this help message and exit\n",
" --report Print a detailed classification report.\n",
" --chi2_select=SELECT_CHI2\n",
" Select some number of features using a chi-squared\n",
" test\n",
" --confusion_matrix Print the confusion matrix.\n",
" --top10 Print ten most discriminative terms per class for\n",
" every classifier.\n",
" --all_categories Whether to use all categories or not.\n",
" --use_hashing Use a hashing vectorizer.\n",
" --n_features=N_FEATURES\n",
" n_features when using the hashing vectorizer.\n",
" --filtered Remove newsgroup information that is easily overfit:\n",
" headers, signatures, and quoting.\n",
"\n",
"Loading 20 newsgroups dataset for categories:\n",
"['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"data loaded\n",
"2034 documents - 3.980MB (training set)\n",
"1353 documents - 2.867MB (test set)\n",
"4 categories\n",
"\n",
"Extracting features from the training data using a sparse vectorizer\n",
"done in 0.737494s at 5.396MB/s\n",
"n_samples: 2034, n_features: 33809\n",
"\n",
"Extracting features from the test data using the same vectorizer\n",
"done in 0.422445s at 6.788MB/s\n",
"n_samples: 1353, n_features: 33809\n",
"\n",
"================================================================================\n",
"Ridge Classifier\n",
"________________________________________________________________________________\n",
"Training: \n",
"RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,\n",
" max_iter=None, normalize=False, random_state=None, solver='sag',\n",
" tol=0.01)\n",
"train time: 0.235s\n",
"test time: 0.006s\n",
"accuracy: 0.896\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"================================================================================\n",
"Perceptron\n",
"________________________________________________________________________________\n",
"Training: \n",
"Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,\n",
" fit_intercept=True, max_iter=50, n_iter=None, n_iter_no_change=5,\n",
" n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.001,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.027s\n",
"test time: 0.003s\n",
"accuracy: 0.888\n",
"dimensionality: 33809\n",
"density: 0.240114\n",
"\n",
"\n",
"================================================================================\n",
"Passive-Aggressive\n",
"________________________________________________________________________________\n",
"Training: \n",
"PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,\n",
" early_stopping=False, fit_intercept=True, loss='hinge',\n",
" max_iter=50, n_iter=None, n_iter_no_change=5, n_jobs=None,\n",
" random_state=None, shuffle=True, tol=0.001,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.031s\n",
"test time: 0.003s\n",
"accuracy: 0.905\n",
"dimensionality: 33809\n",
"density: 0.716584\n",
"\n",
"\n",
"================================================================================\n",
"kNN\n",
"________________________________________________________________________________\n",
"Training: \n",
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=None, n_neighbors=10, p=2,\n",
" weights='uniform')\n",
"train time: 0.005s\n",
"test time: 0.260s\n",
"accuracy: 0.858\n",
"\n",
"================================================================================\n",
"Random forest\n",
"________________________________________________________________________________\n",
"Training: \n",
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)\n",
"train time: 1.827s\n",
"test time: 0.154s\n",
"accuracy: 0.827\n",
"\n",
"================================================================================\n",
"L2 penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.001,\n",
" verbose=0)\n",
"train time: 0.209s\n",
"test time: 0.002s\n",
"accuracy: 0.900\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.171s\n",
"test time: 0.002s\n",
"accuracy: 0.903\n",
"dimensionality: 33809\n",
"density: 0.664172\n",
"\n",
"\n",
"================================================================================\n",
"L1 penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l1', random_state=None, tol=0.001,\n",
" verbose=0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py:183: FutureWarning: max_iter and tol parameters have been added in SGDClassifier in 0.19. If max_iter is set but tol is left unset, the default value for tol in 0.19 and 0.20 will be None (which is equivalent to -infinity, so it has no effect) but will change in 0.21 to 1e-3. Specify tol to silence this warning.\n",
" FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"train time: 0.374s\n",
"test time: 0.004s\n",
"accuracy: 0.873\n",
"dimensionality: 33809\n",
"density: 0.005561\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.487s\n",
"test time: 0.002s\n",
"accuracy: 0.882\n",
"dimensionality: 33809\n",
"density: 0.020387\n",
"\n",
"\n",
"================================================================================\n",
"Elastic-Net penalty\n",
"________________________________________________________________________________\n",
"Training: \n",
"SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
" early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n",
" l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,\n",
" n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',\n",
" power_t=0.5, random_state=None, shuffle=True, tol=None,\n",
" validation_fraction=0.1, verbose=0, warm_start=False)\n",
"train time: 0.625s\n",
"test time: 0.004s\n",
"accuracy: 0.899\n",
"dimensionality: 33809\n",
"density: 0.188648\n",
"\n",
"\n",
"================================================================================\n",
"NearestCentroid (aka Rocchio classifier)\n",
"________________________________________________________________________________\n",
"Training: \n",
"NearestCentroid(metric='euclidean', shrink_threshold=None)\n",
"train time: 0.020s\n",
"test time: 0.005s\n",
"accuracy: 0.855\n",
"\n",
"================================================================================\n",
"Naive Bayes\n",
"________________________________________________________________________________\n",
"Training: \n",
"MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)\n",
"train time: 0.011s\n",
"test time: 0.002s\n",
"accuracy: 0.899\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)\n",
"train time: 0.014s\n",
"test time: 0.012s\n",
"accuracy: 0.884\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"________________________________________________________________________________\n",
"Training: \n",
"ComplementNB(alpha=0.1, class_prior=None, fit_prior=True, norm=False)\n",
"train time: 0.012s\n",
"test time: 0.002s\n",
"accuracy: 0.911\n",
"dimensionality: 33809\n",
"density: 1.000000\n",
"\n",
"\n",
"================================================================================\n",
"LinearSVC with L1-based feature selection\n",
"________________________________________________________________________________\n",
"Training: \n",
"Pipeline(memory=None,\n",
" steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l1', random_state=None, tol=0.001,\n",
" verbose=0),\n",
" max_features=None, no...ax_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
" verbose=0))])\n",
"train time: 0.340s\n",
"test time: 0.005s\n",
"accuracy: 0.880\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
"# Olivier Grisel <olivier.grisel@ensta.org>\n",
"# Mathieu Blondel <mathieu@mblondel.org>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"\n",
"from __future__ import print_function\n",
"\n",
"import logging\n",
"import numpy as np\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"from sklearn.linear_model import RidgeClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.linear_model import Perceptron\n",
"from sklearn.linear_model import PassiveAggressiveClassifier\n",
"from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.neighbors import NearestCentroid\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.utils.extmath import density\n",
"from sklearn import metrics\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--report\",\n",
" action=\"store_true\", dest=\"print_report\",\n",
" help=\"Print a detailed classification report.\")\n",
"op.add_option(\"--chi2_select\",\n",
" action=\"store\", type=\"int\", dest=\"select_chi2\",\n",
" help=\"Select some number of features using a chi-squared test\")\n",
"op.add_option(\"--confusion_matrix\",\n",
" action=\"store_true\", dest=\"print_cm\",\n",
" help=\"Print the confusion matrix.\")\n",
"op.add_option(\"--top10\",\n",
" action=\"store_true\", dest=\"print_top10\",\n",
" help=\"Print ten most discriminative terms per class\"\n",
" \" for every classifier.\")\n",
"op.add_option(\"--all_categories\",\n",
" action=\"store_true\", dest=\"all_categories\",\n",
" help=\"Whether to use all categories or not.\")\n",
"op.add_option(\"--use_hashing\",\n",
" action=\"store_true\",\n",
" help=\"Use a hashing vectorizer.\")\n",
"op.add_option(\"--n_features\",\n",
" action=\"store\", type=int, default=2 ** 16,\n",
" help=\"n_features when using the hashing vectorizer.\")\n",
"op.add_option(\"--filtered\",\n",
" action=\"store_true\",\n",
" help=\"Remove newsgroup information that is easily overfit: \"\n",
" \"headers, signatures, and quoting.\")\n",
"\n",
"\n",
"def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1)\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"if opts.all_categories:\n",
" categories = None\n",
"else:\n",
" categories = [\n",
" 'alt.atheism',\n",
" 'talk.religion.misc',\n",
" 'comp.graphics',\n",
" 'sci.space',\n",
" ]\n",
"\n",
"if opts.filtered:\n",
" remove = ('headers', 'footers', 'quotes')\n",
"else:\n",
" remove = ()\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories if categories else \"all\")\n",
"\n",
"data_train = fetch_20newsgroups(subset='train', categories=categories,\n",
" shuffle=True, random_state=42,\n",
" remove=remove)\n",
"\n",
"data_test = fetch_20newsgroups(subset='test', categories=categories,\n",
" shuffle=True, random_state=42,\n",
" remove=remove)\n",
"print('data loaded')\n",
"\n",
"# order of labels in `target_names` can be different from `categories`\n",
"target_names = data_train.target_names\n",
"\n",
"\n",
"def size_mb(docs):\n",
" return sum(len(s.encode('utf-8')) for s in docs) / 1e6\n",
"\n",
"\n",
"data_train_size_mb = size_mb(data_train.data)\n",
"data_test_size_mb = size_mb(data_test.data)\n",
"\n",
"print(\"%d documents - %0.3fMB (training set)\" % (\n",
" len(data_train.data), data_train_size_mb))\n",
"print(\"%d documents - %0.3fMB (test set)\" % (\n",
" len(data_test.data), data_test_size_mb))\n",
"print(\"%d categories\" % len(target_names))\n",
"print()\n",
"\n",
"# split a training set and a test set\n",
"y_train, y_test = data_train.target, data_test.target\n",
"\n",
"print(\"Extracting features from the training data using a sparse vectorizer\")\n",
"t0 = time()\n",
"if opts.use_hashing:\n",
" vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,\n",
" n_features=opts.n_features)\n",
" X_train = vectorizer.transform(data_train.data)\n",
"else:\n",
" vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,\n",
" stop_words='english')\n",
" X_train = vectorizer.fit_transform(data_train.data)\n",
"duration = time() - t0\n",
"print(\"done in %fs at %0.3fMB/s\" % (duration, data_train_size_mb / duration))\n",
"print(\"n_samples: %d, n_features: %d\" % X_train.shape)\n",
"print()\n",
"\n",
"print(\"Extracting features from the test data using the same vectorizer\")\n",
"t0 = time()\n",
"X_test = vectorizer.transform(data_test.data)\n",
"duration = time() - t0\n",
"print(\"done in %fs at %0.3fMB/s\" % (duration, data_test_size_mb / duration))\n",
"print(\"n_samples: %d, n_features: %d\" % X_test.shape)\n",
"print()\n",
"\n",
"# mapping from integer feature name to original token string\n",
"if opts.use_hashing:\n",
" feature_names = None\n",
"else:\n",
" feature_names = vectorizer.get_feature_names()\n",
"\n",
"if opts.select_chi2:\n",
" print(\"Extracting %d best features by a chi-squared test\" %\n",
" opts.select_chi2)\n",
" t0 = time()\n",
" ch2 = SelectKBest(chi2, k=opts.select_chi2)\n",
" X_train = ch2.fit_transform(X_train, y_train)\n",
" X_test = ch2.transform(X_test)\n",
" if feature_names:\n",
" # keep selected feature names\n",
" feature_names = [feature_names[i] for i\n",
" in ch2.get_support(indices=True)]\n",
" print(\"done in %fs\" % (time() - t0))\n",
" print()\n",
"\n",
"if feature_names:\n",
" feature_names = np.asarray(feature_names)\n",
"\n",
"\n",
"def trim(s):\n",
" \"\"\"Trim string to fit on terminal (assuming 80-column display)\"\"\"\n",
" return s if len(s) <= 80 else s[:77] + \"...\"\n",
"\n",
"\n",
"# #############################################################################\n",
"# Benchmark classifiers\n",
"def benchmark(clf):\n",
" print('_' * 80)\n",
" print(\"Training: \")\n",
" print(clf)\n",
" t0 = time()\n",
" clf.fit(X_train, y_train)\n",
" train_time = time() - t0\n",
" print(\"train time: %0.3fs\" % train_time)\n",
"\n",
" t0 = time()\n",
" pred = clf.predict(X_test)\n",
" test_time = time() - t0\n",
" print(\"test time: %0.3fs\" % test_time)\n",
"\n",
" score = metrics.accuracy_score(y_test, pred)\n",
" print(\"accuracy: %0.3f\" % score)\n",
"\n",
" if hasattr(clf, 'coef_'):\n",
" print(\"dimensionality: %d\" % clf.coef_.shape[1])\n",
" print(\"density: %f\" % density(clf.coef_))\n",
"\n",
" if opts.print_top10 and feature_names is not None:\n",
" print(\"top 10 keywords per class:\")\n",
" for i, label in enumerate(target_names):\n",
" top10 = np.argsort(clf.coef_[i])[-10:]\n",
" print(trim(\"%s: %s\" % (label, \" \".join(feature_names[top10]))))\n",
" print()\n",
"\n",
" if opts.print_report:\n",
" print(\"classification report:\")\n",
" print(metrics.classification_report(y_test, pred,\n",
" target_names=target_names))\n",
"\n",
" if opts.print_cm:\n",
" print(\"confusion matrix:\")\n",
" print(metrics.confusion_matrix(y_test, pred))\n",
"\n",
" print()\n",
" clf_descr = str(clf).split('(')[0]\n",
" return clf_descr, score, train_time, test_time\n",
"\n",
"\n",
"results = []\n",
"for clf, name in (\n",
" (RidgeClassifier(tol=1e-2, solver=\"sag\"), \"Ridge Classifier\"),\n",
" (Perceptron(max_iter=50, tol=1e-3), \"Perceptron\"),\n",
" (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),\n",
" \"Passive-Aggressive\"),\n",
" (KNeighborsClassifier(n_neighbors=10), \"kNN\"),\n",
" (RandomForestClassifier(n_estimators=100), \"Random forest\")):\n",
" print('=' * 80)\n",
" print(name)\n",
" results.append(benchmark(clf))\n",
"\n",
"for penalty in [\"l2\", \"l1\"]:\n",
" print('=' * 80)\n",
" print(\"%s penalty\" % penalty.upper())\n",
" # Train Liblinear model\n",
" results.append(benchmark(LinearSVC(penalty=penalty, dual=False,\n",
" tol=1e-3)))\n",
"\n",
" # Train SGD model\n",
" results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,\n",
" penalty=penalty)))\n",
"\n",
"# Train SGD with Elastic Net penalty\n",
"print('=' * 80)\n",
"print(\"Elastic-Net penalty\")\n",
"results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,\n",
" penalty=\"elasticnet\")))\n",
"\n",
"# Train NearestCentroid without threshold\n",
"print('=' * 80)\n",
"print(\"NearestCentroid (aka Rocchio classifier)\")\n",
"results.append(benchmark(NearestCentroid()))\n",
"\n",
"# Train sparse Naive Bayes classifiers\n",
"print('=' * 80)\n",
"print(\"Naive Bayes\")\n",
"results.append(benchmark(MultinomialNB(alpha=.01)))\n",
"results.append(benchmark(BernoulliNB(alpha=.01)))\n",
"results.append(benchmark(ComplementNB(alpha=.1)))\n",
"\n",
"print('=' * 80)\n",
"print(\"LinearSVC with L1-based feature selection\")\n",
"# The smaller C, the stronger the regularization.\n",
"# The more regularization, the more sparsity.\n",
"results.append(benchmark(Pipeline([\n",
" ('feature_selection', SelectFromModel(LinearSVC(penalty=\"l1\", dual=False,\n",
" tol=1e-3))),\n",
" ('classification', LinearSVC(penalty=\"l2\"))])))\n",
"\n",
"# make some plots\n",
"\n",
"indices = np.arange(len(results))\n",
"\n",
"results = [[x[i] for x in results] for i in range(4)]\n",
"\n",
"clf_names, score, training_time, test_time = results\n",
"training_time = np.array(training_time) / np.max(training_time)\n",
"test_time = np.array(test_time) / np.max(test_time)\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"plt.title(\"Score\")\n",
"plt.barh(indices, score, .2, label=\"score\", color='navy')\n",
"plt.barh(indices + .3, training_time, .2, label=\"training time\",\n",
" color='c')\n",
"plt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\n",
"plt.yticks(())\n",
"plt.legend(loc='best')\n",
"plt.subplots_adjust(left=.25)\n",
"plt.subplots_adjust(top=.95)\n",
"plt.subplots_adjust(bottom=.05)\n",
"\n",
"for i, c in zip(indices, clf_names):\n",
" plt.text(-.3, i, c)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ejemplo completo Clustering"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n",
"Usage: ipykernel_launcher.py [options]\n",
"\n",
"Options:\n",
" -h, --help show this help message and exit\n",
" --lsa=N_COMPONENTS Preprocess documents with latent semantic analysis.\n",
" --no-minibatch Use ordinary k-means algorithm (in batch mode).\n",
" --no-idf Disable Inverse Document Frequency feature weighting.\n",
" --use-hashing Use a hashing feature vectorizer\n",
" --n-features=N_FEATURES\n",
" Maximum number of features (dimensions) to extract\n",
" from text.\n",
" --verbose Print progress reports inside k-means algorithm.\n",
"Loading 20 newsgroups dataset for categories:\n",
"['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']\n",
"3387 documents\n",
"4 categories\n",
"\n",
"Extracting features from the training dataset using a sparse vectorizer\n",
"done in 1.281258s\n",
"n_samples: 3387, n_features: 10000\n",
"\n",
"Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',\n",
" init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,\n",
" n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,\n",
" verbose=False)\n",
"done in 0.094s\n",
"\n",
"Homogeneity: 0.596\n",
"Completeness: 0.651\n",
"V-measure: 0.623\n",
"Adjusted Rand-Index: 0.569\n",
"Silhouette Coefficient: 0.008\n",
"\n",
"Top terms per cluster:\n",
"Cluster 0: graphics image university thanks com files file 3d ac posting\n",
"Cluster 1: access digex henry pat toronto net com hst prb zoo\n",
"Cluster 2: space nasa gov alaska moon launch com shuttle just like\n",
"Cluster 3: god com people sandvik article don jesus say keith christian\n"
]
}
],
"source": [
"# Cluster 20 Newsgroups posts into topic groups with k-means and report\n",
"# standard clustering quality metrics against the known category labels.\n",
"# (scikit-learn's \"Clustering text documents using k-means\" example,\n",
"# runnable both inside the notebook and as a command-line script.)\n",
"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
"# Lars Buitinck\n",
"# License: BSD 3 clause\n",
"\n",
"from __future__ import print_function\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import Normalizer\n",
"from sklearn import metrics\n",
"\n",
"from sklearn.cluster import KMeans, MiniBatchKMeans\n",
"\n",
"import logging\n",
"from optparse import OptionParser\n",
"import sys\n",
"from time import time\n",
"\n",
"import numpy as np\n",
"\n",
"\n",
"# Display progress logs on stdout\n",
"logging.basicConfig(level=logging.INFO,\n",
"                    format='%(asctime)s %(levelname)s %(message)s')\n",
"\n",
"# parse commandline arguments\n",
"op = OptionParser()\n",
"op.add_option(\"--lsa\",\n",
"              dest=\"n_components\", type=\"int\",\n",
"              help=\"Preprocess documents with latent semantic analysis.\")\n",
"op.add_option(\"--no-minibatch\",\n",
"              action=\"store_false\", dest=\"minibatch\", default=True,\n",
"              help=\"Use ordinary k-means algorithm (in batch mode).\")\n",
"op.add_option(\"--no-idf\",\n",
"              action=\"store_false\", dest=\"use_idf\", default=True,\n",
"              help=\"Disable Inverse Document Frequency feature weighting.\")\n",
"op.add_option(\"--use-hashing\",\n",
"              action=\"store_true\", default=False,\n",
"              help=\"Use a hashing feature vectorizer\")\n",
"op.add_option(\"--n-features\", type=int, default=10000,\n",
"              help=\"Maximum number of features (dimensions)\"\n",
"                   \" to extract from text.\")\n",
"op.add_option(\"--verbose\",\n",
"              action=\"store_true\", dest=\"verbose\", default=False,\n",
"              help=\"Print progress reports inside k-means algorithm.\")\n",
"\n",
"print(__doc__)\n",
"op.print_help()\n",
"\n",
"\n",
"def is_interactive():\n",
"    return not hasattr(sys.modules['__main__'], '__file__')\n",
"\n",
"\n",
"# work-around for Jupyter notebook and IPython console\n",
"argv = [] if is_interactive() else sys.argv[1:]\n",
"(opts, args) = op.parse_args(argv)\n",
"# NOTE: OptionParser.error() prints the message and exits the process\n",
"# itself (status 2), so the sys.exit(1) below is effectively unreachable.\n",
"if len(args) > 0:\n",
"    op.error(\"this script takes no arguments.\")\n",
"    sys.exit(1)\n",
"\n",
"\n",
"# #############################################################################\n",
"# Load some categories from the training set\n",
"categories = [\n",
"    'alt.atheism',\n",
"    'talk.religion.misc',\n",
"    'comp.graphics',\n",
"    'sci.space',\n",
"]\n",
"# Uncomment the following to do the analysis on all the categories\n",
"# categories = None\n",
"\n",
"print(\"Loading 20 newsgroups dataset for categories:\")\n",
"print(categories)\n",
"\n",
"dataset = fetch_20newsgroups(subset='all', categories=categories,\n",
"                             shuffle=True, random_state=42)\n",
"\n",
"print(\"%d documents\" % len(dataset.data))\n",
"print(\"%d categories\" % len(dataset.target_names))\n",
"print()\n",
"\n",
"labels = dataset.target\n",
"true_k = np.unique(labels).shape[0]\n",
"\n",
"print(\"Extracting features from the training dataset \"\n",
"      \"using a sparse vectorizer\")\n",
"t0 = time()\n",
"# Build one of three vectorizers depending on the flags:\n",
"#   --use-hashing with IDF    -> HashingVectorizer piped into TfidfTransformer\n",
"#   --use-hashing without IDF -> plain l2-normalised HashingVectorizer\n",
"#   default                   -> TfidfVectorizer\n",
"if opts.use_hashing:\n",
"    if opts.use_idf:\n",
"        # Perform an IDF normalization on the output of HashingVectorizer\n",
"        hasher = HashingVectorizer(n_features=opts.n_features,\n",
"                                   stop_words='english', alternate_sign=False,\n",
"                                   norm=None, binary=False)\n",
"        vectorizer = make_pipeline(hasher, TfidfTransformer())\n",
"    else:\n",
"        vectorizer = HashingVectorizer(n_features=opts.n_features,\n",
"                                       stop_words='english',\n",
"                                       alternate_sign=False, norm='l2',\n",
"                                       binary=False)\n",
"else:\n",
"    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n",
"                                 min_df=2, stop_words='english',\n",
"                                 use_idf=opts.use_idf)\n",
"X = vectorizer.fit_transform(dataset.data)\n",
"\n",
"print(\"done in %fs\" % (time() - t0))\n",
"print(\"n_samples: %d, n_features: %d\" % X.shape)\n",
"print()\n",
"\n",
"# Optional LSA dimensionality reduction; runs only when --lsa N was given.\n",
"if opts.n_components:\n",
"    print(\"Performing dimensionality reduction using LSA\")\n",
"    t0 = time()\n",
"    # Vectorizer results are normalized, which makes KMeans behave as\n",
"    # spherical k-means for better results. Since LSA/SVD results are\n",
"    # not normalized, we have to redo the normalization.\n",
"    svd = TruncatedSVD(opts.n_components)\n",
"    normalizer = Normalizer(copy=False)\n",
"    lsa = make_pipeline(svd, normalizer)\n",
"\n",
"    X = lsa.fit_transform(X)\n",
"\n",
"    print(\"done in %fs\" % (time() - t0))\n",
"\n",
"    explained_variance = svd.explained_variance_ratio_.sum()\n",
"    print(\"Explained variance of the SVD step: {}%\".format(\n",
"        int(explained_variance * 100)))\n",
"\n",
"    print()\n",
"\n",
"\n",
"# #############################################################################\n",
"# Do the actual clustering\n",
"\n",
"if opts.minibatch:\n",
"    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n",
"                         init_size=1000, batch_size=1000, verbose=opts.verbose)\n",
"else:\n",
"    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n",
"                verbose=opts.verbose)\n",
"\n",
"print(\"Clustering sparse data with %s\" % km)\n",
"t0 = time()\n",
"km.fit(X)\n",
"print(\"done in %0.3fs\" % (time() - t0))\n",
"print()\n",
"\n",
"print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n",
"print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n",
"print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n",
"print(\"Adjusted Rand-Index: %.3f\"\n",
"      % metrics.adjusted_rand_score(labels, km.labels_))\n",
"# Silhouette is estimated on a random 1000-document sample to keep it cheap.\n",
"print(\"Silhouette Coefficient: %0.3f\"\n",
"      % metrics.silhouette_score(X, km.labels_, sample_size=1000))\n",
"\n",
"print()\n",
"\n",
"\n",
"if not opts.use_hashing:\n",
"    print(\"Top terms per cluster:\")\n",
"\n",
"    if opts.n_components:\n",
"        original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n",
"        order_centroids = original_space_centroids.argsort()[:, ::-1]\n",
"    else:\n",
"        order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n",
"\n",
"    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;\n",
"    # newer versions need get_feature_names_out() here.\n",
"    terms = vectorizer.get_feature_names()\n",
"    for i in range(true_k):\n",
"        print(\"Cluster %d:\" % i, end='')\n",
"        for ind in order_centroids[i, :10]:\n",
"            print(' %s' % terms[ind], end='')\n",
"        print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Referencias \n",
"\n",
"Pérez C. et al. (2016). Recuperación, procesamiento y clasificación de tuits para visualizar estructuras de interacción. Research in Computing Science Journal, 124 (1), 23-37. http://www.rcs.cic.ipn.mx/2016_124/Recuperacion_%20procesamiento%20y%20clasificacion%20de%20tuits%20para%20visualizar%20estructuras%20de%20interaccion.pdf\n",
"\n",
"\n",
"T. Joachims (1996). A probabilistic analysis of the Rocchio algorithm with TFIDF for text categorization, Computer Science Technical Report CMU-CS-96-118. Carnegie Mellon University.\n",
"http://rexa.info/paper/7c077ad01b1a7f0605ca075ead0193d4555c2619\n",
"\n",
"\n"
] ]
}, },
{ {
...@@ -46,7 +1377,7 @@ ...@@ -46,7 +1377,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.8rc1" "version": "3.7.1"
} }
}, },
"nbformat": 4, "nbformat": 4,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment