Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
tap1012
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Mario Chirinos Colunga
tap1012
Commits
bc1ba6a1
Commit
bc1ba6a1
authored
Apr 15, 2019
by
Carlos Manuel Chable Jimenez
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
4f3efc3b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
1222 additions
and
0 deletions
+1222
-0
periodicointeligente.ipynb
periodicointeligente.ipynb
+1222
-0
No files found.
periodicointeligente.ipynb
0 → 100644
View file @
bc1ba6a1
{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Leer todos los documentos en el directorio periodico y sus subdirectorios,\n",
"# que almacenan los CSV que se descargaron: cultura, deportes, espectáculos, política.\n",
"# Cada artículo está almacenado en un fichero de texto dentro de un directorio llamado igual\n",
"# que la categoría a la que pertenece el artículo.\n",
"\n",
"import os\n",
"\n",
"def read_all_documents(root):\n",
" labels = []\n",
" docs = []\n",
" for r, dirs, files in os.walk(root):\n",
" for file in files:\n",
" with open(os.path.join(r, file), \"r\" , encoding=\"utf8\") as f:\n",
" docs.append(f.read()) \n",
" labels.append(r.replace(root, ''))\n",
" return dict([('docs', docs), ('labels', labels)])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Cargamos los artículos y sus etiquetas desde el directorio; con ellos se construirá después la matriz tf-idf\n",
"data = read_all_documents(r'C:\\Users\\cchable\\Documents\\Tap1012\\molina\\E00\\E00\\text-classifier-master\\periodico')\n",
"documents = data['docs']\n",
"labels = data['labels']"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['0',\n",
" '00',\n",
" '000',\n",
" '003n1pol',\n",
" '003n2pol',\n",
" '004n1pol',\n",
" '004n2pol',\n",
" '005n1pol',\n",
" '007n2pol',\n",
" '008n1pol',\n",
" '009n1pol',\n",
" '01',\n",
" '010n1pol',\n",
" '010n2pol',\n",
" '011n1pol',\n",
" '014n1pol',\n",
" '02',\n",
" '03',\n",
" '04',\n",
" '05',\n",
" '06',\n",
" '06700',\n",
" '07',\n",
" '08',\n",
" '09',\n",
" '1',\n",
" '10',\n",
" '100',\n",
" '1000',\n",
" '1007',\n",
" '11',\n",
" '110',\n",
" '111',\n",
" '112',\n",
" '11320',\n",
" '12',\n",
" '122',\n",
" '123',\n",
" '1260',\n",
" '13',\n",
" '130',\n",
" '137',\n",
" '14',\n",
" '140',\n",
" '147',\n",
" '15',\n",
" '150',\n",
" '152',\n",
" '1531',\n",
" '16',\n",
" '160',\n",
" '17',\n",
" '1798',\n",
" '18',\n",
" '180',\n",
" '1800',\n",
" '1850',\n",
" '1863',\n",
" '1866',\n",
" '1875',\n",
" '1877',\n",
" '1878',\n",
" '1879',\n",
" '1883',\n",
" '1885',\n",
" '1891',\n",
" '1895',\n",
" '19',\n",
" '1901',\n",
" '1917',\n",
" '1921',\n",
" '1922',\n",
" '1923',\n",
" '1928',\n",
" '1930',\n",
" '1934',\n",
" '1935',\n",
" '1936',\n",
" '1938',\n",
" '1939',\n",
" '1942',\n",
" '1943',\n",
" '1944',\n",
" '1945',\n",
" '1949',\n",
" '1950',\n",
" '1952',\n",
" '1953',\n",
" '1955',\n",
" '1958',\n",
" '1960',\n",
" '1962',\n",
" '1963',\n",
" '1964',\n",
" '1966',\n",
" '1967',\n",
" '1968',\n",
" '1969',\n",
" '1970',\n",
" '1971',\n",
" '1972',\n",
" '1973',\n",
" '1974',\n",
" '1977',\n",
" '1978',\n",
" '1979',\n",
" '1980',\n",
" '1981',\n",
" '1982',\n",
" '1983',\n",
" '1984',\n",
" '1985',\n",
" '1986',\n",
" '1987',\n",
" '1988',\n",
" '1989',\n",
" '1990',\n",
" '1993',\n",
" '1994',\n",
" '1995',\n",
" '1997',\n",
" '1998',\n",
" '1999',\n",
" '1er',\n",
" '1ª',\n",
" '1º',\n",
" '2',\n",
" '20',\n",
" '200',\n",
" '2000',\n",
" '2002',\n",
" '2003',\n",
" '2004',\n",
" '2005',\n",
" '2006',\n",
" '2007',\n",
" '2008',\n",
" '2009',\n",
" '2010',\n",
" '2011',\n",
" '2012',\n",
" '2013',\n",
" '2014',\n",
" '2015',\n",
" '2016',\n",
" '2017',\n",
" '2018',\n",
" '2019',\n",
" '2020',\n",
" '2021',\n",
" '2022',\n",
" '2026',\n",
" '21',\n",
" '22',\n",
" '225',\n",
" '23',\n",
" '235',\n",
" '235b2c',\n",
" '24',\n",
" '240',\n",
" '24h',\n",
" '25',\n",
" '250',\n",
" '251',\n",
" '26',\n",
" '27',\n",
" '270821',\n",
" '270850',\n",
" '270938',\n",
" '271088',\n",
" '271090',\n",
" '274452',\n",
" '274761',\n",
" '275576',\n",
" '276294',\n",
" '276551',\n",
" '276671',\n",
" '277070',\n",
" '278423',\n",
" '278975',\n",
" '278981',\n",
" '279097',\n",
" '279231',\n",
" '279929',\n",
" '28',\n",
" '281475',\n",
" '283',\n",
" '29',\n",
" '296',\n",
" '2ª',\n",
" '3',\n",
" '30',\n",
" '300',\n",
" '3000',\n",
" '3000894',\n",
" '303',\n",
" '308',\n",
" '31',\n",
" '3111668',\n",
" '314',\n",
" '318',\n",
" '3220093',\n",
" '3231726',\n",
" '3235087',\n",
" '3267043',\n",
" '3267639',\n",
" '3269488',\n",
" '33',\n",
" '34',\n",
" '340',\n",
" '342',\n",
" '35',\n",
" '350',\n",
" '36',\n",
" '360',\n",
" '37',\n",
" '38',\n",
" '39',\n",
" '3er',\n",
" '3kids',\n",
" '3ª',\n",
" '4',\n",
" '40',\n",
" '41',\n",
" '411',\n",
" '42',\n",
" '43',\n",
" '45',\n",
" '46',\n",
" '477',\n",
" '48',\n",
" '49er',\n",
" '49erfx',\n",
" '4t',\n",
" '4ª',\n",
" '5',\n",
" '50',\n",
" '500',\n",
" '51',\n",
" '52',\n",
" '520',\n",
" '53',\n",
" '540',\n",
" '54021',\n",
" '55',\n",
" '564',\n",
" '565',\n",
" '58',\n",
" '582',\n",
" '59',\n",
" '5ª',\n",
" '6',\n",
" '60',\n",
" '600',\n",
" '61',\n",
" '650',\n",
" '67',\n",
" '68',\n",
" '69',\n",
" '6h',\n",
" '6ª',\n",
" '7',\n",
" '70',\n",
" '71',\n",
" '72',\n",
" '73',\n",
" '75',\n",
" '76ta',\n",
" '79',\n",
" '7ª',\n",
" '8',\n",
" '80',\n",
" '800',\n",
" '85',\n",
" '880',\n",
" '89',\n",
" '894',\n",
" '9',\n",
" '90',\n",
" '900',\n",
" '901',\n",
" '91',\n",
" '91a',\n",
" '934',\n",
" '94',\n",
" '942',\n",
" '97',\n",
" '98',\n",
" 'a',\n",
" 'a03n1cul',\n",
" 'a03n2cul',\n",
" 'a04n1cul',\n",
" 'a04n3cul',\n",
" 'a05n1cul',\n",
" 'a12n1dis',\n",
" 'a12n1vox',\n",
" 'aare',\n",
" 'aarhus',\n",
" 'aaron',\n",
" 'aarón',\n",
" 'abajo',\n",
" 'abandona',\n",
" 'abandonaba',\n",
" 'abandonarán',\n",
" 'abandonen',\n",
" 'abandono',\n",
" 'abandonó',\n",
" 'abarca',\n",
" 'abarcar',\n",
" 'abatir',\n",
" 'abierta',\n",
" 'abiertas',\n",
" 'abierto',\n",
" 'abiertos',\n",
" 'abigail',\n",
" 'abimerhi',\n",
" 'abogadillo',\n",
" 'abogados',\n",
" 'aborda',\n",
" 'abordada',\n",
" 'abordado',\n",
" 'abordan',\n",
" 'abordar',\n",
" 'abordarse',\n",
" 'abordará',\n",
" 'abordó',\n",
" 'aborigen',\n",
" 'aborto',\n",
" 'aborígenes',\n",
" 'abraham',\n",
" 'abrazos',\n",
" 'abren',\n",
" 'abreviatura',\n",
" 'abriera',\n",
" 'abrieron',\n",
" 'abril',\n",
" 'abrir',\n",
" 'abrirme',\n",
" 'abrirá',\n",
" 'abrirán',\n",
" 'abrió',\n",
" 'abroga',\n",
" 'abrogación',\n",
" 'abrogada',\n",
" 'abrogar',\n",
" 'abrogarla',\n",
" 'abrogará',\n",
" 'abrogue',\n",
" 'abrumadoras',\n",
" 'abruptamente',\n",
" 'absalón',\n",
" 'absolutamente',\n",
" 'absolutas',\n",
" 'absoluto',\n",
" 'abstenciones',\n",
" 'abstenido',\n",
" 'abstracto',\n",
" 'abu',\n",
" 'abundantes',\n",
" 'abundó',\n",
" 'aburrida',\n",
" 'abusando',\n",
" 'abusivas',\n",
" 'ac',\n",
" 'acaba',\n",
" 'acabamos',\n",
" 'acabando',\n",
" 'acabar',\n",
" 'acabe',\n",
" 'acabo',\n",
" 'acabó',\n",
" 'academia',\n",
" 'academias',\n",
" 'académicas',\n",
" 'académico',\n",
" 'académicos',\n",
" 'acapulco',\n",
" 'acceder',\n",
" 'accelerate',\n",
" 'acceso',\n",
" 'accesos',\n",
" 'accidentado',\n",
" 'accidente',\n",
" 'acciones',\n",
" 'acción',\n",
" 'acentuada',\n",
" 'acepta',\n",
" 'aceptación',\n",
" 'aceptado',\n",
" 'aceptar',\n",
" 'aceptaron',\n",
" 'aceptará',\n",
" 'acepto',\n",
" 'acerca',\n",
" 'acercamiento',\n",
" 'acercamos',\n",
" 'acercan',\n",
" 'acercar',\n",
" 'acercarnos',\n",
" 'acercarse',\n",
" 'acercó',\n",
" 'acero',\n",
" 'acerque',\n",
" 'acervo',\n",
" 'acervos',\n",
" 'achicarse',\n",
" 'achievement',\n",
" 'acierto',\n",
" 'aclararon',\n",
" 'aclaró',\n",
" 'acogerá',\n",
" 'acogida',\n",
" 'acogido',\n",
" 'acogieron',\n",
" 'acomodarlos',\n",
" 'acompaña',\n",
" 'acompañada',\n",
" 'acompañado',\n",
" 'acompañan',\n",
" 'acompañaron',\n",
" 'acompañen',\n",
" 'acondicionar',\n",
" 'aconseja',\n",
" 'acontecer',\n",
" 'acontecimiento',\n",
" 'acontecimientos',\n",
" 'acordado',\n",
" 'acordamos',\n",
" 'acordaron',\n",
" 'acordarte',\n",
" 'acorde',\n",
" 'acordonando',\n",
" 'acordó',\n",
" 'acorta',\n",
" 'acostumbrada',\n",
" 'acostumbrado',\n",
" 'acostumbrados',\n",
" 'acostumbran',\n",
" 'acotó',\n",
" 'acrecentar',\n",
" 'acreditaban',\n",
" 'acres',\n",
" 'acrobacias',\n",
" 'actitud',\n",
" 'activaciones',\n",
" 'activación',\n",
" 'actividad',\n",
" 'actividades',\n",
" 'activismo',\n",
" 'activistas',\n",
" 'activo',\n",
" 'activos',\n",
" 'acto',\n",
" 'actor',\n",
" 'actoral',\n",
" 'actorales',\n",
" 'actores',\n",
" 'actos',\n",
" 'actrices',\n",
" 'actriz',\n",
" 'actuaciones',\n",
" 'actuación',\n",
" 'actuado',\n",
" 'actual',\n",
" 'actuales',\n",
" 'actualidad',\n",
" 'actualización',\n",
" 'actualmente',\n",
" 'actuando',\n",
" 'actuar',\n",
" 'actuarán',\n",
" 'actuó',\n",
" 'actúa',\n",
" 'actúan',\n",
" 'acu',\n",
" 'acuarelas',\n",
" 'acuda',\n",
" 'acuden',\n",
" 'acudieron',\n",
" 'acudir',\n",
" 'acudirán',\n",
" 'acudió',\n",
" 'acuerdo',\n",
" 'acuerdos',\n",
" 'acuetzpallin',\n",
" 'acufi',\n",
" 'acumula',\n",
" 'acumulamos',\n",
" 'acumulan',\n",
" 'acusaciones',\n",
" 'acusadoras',\n",
" 'acusara',\n",
" 'acuático',\n",
" 'adam',\n",
" 'adaptación',\n",
" 'adaptada',\n",
" 'adaptado',\n",
" 'adaptando',\n",
" 'adaptarme',\n",
" 'adaptarse',\n",
" 'adapte',\n",
" 'addy',\n",
" 'adecuaciones',\n",
" 'adecuada',\n",
" 'adecuadas',\n",
" 'adela',\n",
" 'adelantado',\n",
" 'adelantara',\n",
" 'adelantaron',\n",
" 'adelante',\n",
" 'adelantó',\n",
" 'además',\n",
" 'adenda',\n",
" 'adentra',\n",
" 'adentrarse',\n",
" 'adentrará',\n",
" 'adentro',\n",
" 'adeptos',\n",
" 'adeudos',\n",
" 'adicional',\n",
" 'adictos',\n",
" 'adiós',\n",
" 'adjudicarse',\n",
" 'adjunto',\n",
" 'adler',\n",
" 'administra',\n",
" 'administraciones',\n",
" 'administración',\n",
" 'administrativa',\n",
" 'administrativas',\n",
" 'administrativo',\n",
" 'admira',\n",
" 'admirables',\n",
" 'admiración',\n",
" 'admiró',\n",
" 'admisión',\n",
" 'admita',\n",
" 'admiten',\n",
" 'admitiría',\n",
" 'admitió',\n",
" 'adn',\n",
" 'adolescentes',\n",
" 'adolfo',\n",
" 'adopción',\n",
" 'adoptado',\n",
" 'adoptó',\n",
" 'adquiera',\n",
" 'adquieran',\n",
" 'adquirida',\n",
" 'adquirir',\n",
" 'adquirió',\n",
" 'adrian',\n",
" 'adriana',\n",
" 'adrián',\n",
" 'adriático',\n",
" 'adueñado',\n",
" 'adulto',\n",
" 'adultos',\n",
" 'advertencia',\n",
" 'advertía',\n",
" 'advierte',\n",
" 'advierten',\n",
" 'advirtieron',\n",
" 'advirtió',\n",
" 'aeronave',\n",
" 'aeroplano',\n",
" 'aeropuertos',\n",
" 'afamado',\n",
" 'afcl',\n",
" 'afectación',\n",
" 'afectada',\n",
" 'afectadas',\n",
" 'afectados',\n",
" 'afectando',\n",
" 'afectará',\n",
" 'afecten',\n",
" 'afectiva',\n",
" 'afecto',\n",
" 'affair',\n",
" 'affeldt',\n",
" 'afianzar',\n",
" 'aficionada',\n",
" 'aficionado',\n",
" 'aficionados',\n",
" 'afición',\n",
" 'afilado',\n",
" 'afiliación',\n",
" 'afiliados',\n",
" 'afinidad',\n",
" 'afirma',\n",
" 'afirmaba',\n",
" 'afirmando',\n",
" 'afirmar',\n",
" 'afirmaron',\n",
" 'afirmó',\n",
" 'aflojado',\n",
" 'afluencia',\n",
" 'aforismos',\n",
" 'afortunadamente',\n",
" 'afortunados',\n",
" 'afp',\n",
" 'afrontar',\n",
" 'afuera',\n",
" 'again',\n",
" 'agarró',\n",
" 'agencia',\n",
" 'agenda',\n",
" 'agendado',\n",
" 'agendas',\n",
" 'agente',\n",
" 'agitado',\n",
" 'aglutina',\n",
" 'agn',\n",
" 'ago',\n",
" 'agosto',\n",
" 'agotadas',\n",
" 'agotar',\n",
" 'agote',\n",
" 'agradable',\n",
" 'agradará',\n",
" 'agradecer',\n",
" 'agradecerle',\n",
" 'agradecerles',\n",
" 'agradecida',\n",
" 'agradecido',\n",
" 'agradecidos',\n",
" 'agradecieron',\n",
" 'agradecimiento',\n",
" 'agradecimientos',\n",
" 'agradeció',\n",
" 'agradezco',\n",
" 'agradó',\n",
" 'agrarias',\n",
" 'agrarios',\n",
" 'agravia',\n",
" 'agravios',\n",
" 'agrega',\n",
" 'agregar',\n",
" 'agregaron',\n",
" 'agregó',\n",
" 'agresiones',\n",
" 'agresión',\n",
" 'agrupación',\n",
" 'agrupados',\n",
" 'agrícolas',\n",
" 'agua',\n",
" 'aguada',\n",
" 'aguascalientes',\n",
" 'aguayo',\n",
" 'aguda',\n",
" 'agudizaron',\n",
" 'agudo',\n",
" 'aguilar',\n",
" 'aguilera',\n",
" 'aguinaldo',\n",
" 'agujero',\n",
" 'agujeros',\n",
" 'agustina',\n",
" 'agustín',\n",
" 'ahora',\n",
" 'ahorita',\n",
" 'ahorro',\n",
" 'ahí',\n",
" 'aintree',\n",
" 'aire',\n",
" 'aires',\n",
" 'airlines',\n",
" 'ajeno',\n",
" 'ajustarme',\n",
" 'ajustarse',\n",
" 'ajustes',\n",
" 'akc',\n",
" 'akron',\n",
" 'al',\n",
" 'ala',\n",
" 'alabama',\n",
" 'alambre',\n",
" 'alameda',\n",
" 'alanís',\n",
" 'alas',\n",
" 'albazo',\n",
" 'alberca',\n",
" 'albercas',\n",
" 'alberga',\n",
" 'albergan',\n",
" 'albergar',\n",
" 'albergó',\n",
" 'albert',\n",
" 'alberto',\n",
" 'alboa',\n",
" 'alborada',\n",
" 'albufeira',\n",
" 'albán',\n",
" 'alcadías',\n",
" 'alcalde',\n",
" 'alcaldes',\n",
" 'alcaldesa',\n",
" 'alcaldias',\n",
" 'alcaldía',\n",
" 'alcaldías',\n",
" 'alcalá',\n",
" 'alcances',\n",
" 'alcanzado',\n",
" 'alcanzados',\n",
" 'alcanzar',\n",
" 'alcanzará',\n",
" 'alcanzó',\n",
" 'alcohol',\n",
" 'alcántara',\n",
" 'alcázar',\n",
" 'ale',\n",
" 'alebrije',\n",
" 'alebrijes',\n",
" 'aleccionadora',\n",
" 'aledañas',\n",
" 'alegre',\n",
" 'alegres',\n",
" 'alegria',\n",
" 'alegría',\n",
" 'alejada',\n",
" 'alejados',\n",
" 'alejandra',\n",
" 'alejandro',\n",
" 'alejará',\n",
" 'alemania',\n",
" 'alemán',\n",
" 'alentado',\n",
" 'alentador',\n",
" 'alerta',\n",
" 'alertan',\n",
" 'alertaron',\n",
" 'alertó',\n",
" 'alex',\n",
" 'alfabetización',\n",
" 'alfombra',\n",
" 'alfonso',\n",
" 'alfredo',\n",
" 'algarve',\n",
" 'algo',\n",
" 'algodoneros',\n",
" 'alguien',\n",
" 'alguna',\n",
" 'algunas',\n",
" 'alguno',\n",
" 'algunos',\n",
" 'algún',\n",
" 'aliados',\n",
" 'alicante',\n",
" 'alicia',\n",
" 'aliento',\n",
" 'alimentación',\n",
" 'alimentar',\n",
" 'alinea',\n",
" 'alista',\n",
" 'alix',\n",
" 'all',\n",
" 'allen',\n",
" 'allende',\n",
" 'allá',\n",
" 'allí',\n",
" 'almacén',\n",
" 'almadía',\n",
" 'almas',\n",
" 'almaty',\n",
" 'almazán',\n",
" 'almeyda',\n",
" 'alonso',\n",
" 'alpino',\n",
" 'alrededor',\n",
" 'alta',\n",
" 'altamente',\n",
" 'altar',\n",
" 'altares',\n",
" 'altas',\n",
" 'alterado',\n",
" 'altered',\n",
" 'alterna',\n",
" 'alternar',\n",
" 'alternativa',\n",
" 'alternativas',\n",
" 'altitud',\n",
" 'alto',\n",
" 'altos',\n",
" 'altura',\n",
" 'alturas',\n",
" 'alude',\n",
" 'alumnos',\n",
" 'alusión',\n",
" 'alvarado',\n",
" 'alvear',\n",
" 'alza',\n",
" 'alzando',\n",
" 'alzarse',\n",
" 'alzueta',\n",
" 'alzó',\n",
" 'am',\n",
" 'amables',\n",
" 'amados',\n",
" 'amadrinada',\n",
" 'amaga',\n",
" 'amalia',\n",
" 'amanecer',\n",
" 'amantes',\n",
" 'amarillas',\n",
" 'amarillo',\n",
" 'amarrado',\n",
" 'amateur',\n",
" 'amateurs',\n",
" 'amaury',\n",
" 'amazon',\n",
" 'amazonía',\n",
" 'amb',\n",
" 'ambas',\n",
" 'amberes',\n",
" 'ambientada',\n",
" 'ambiental',\n",
" 'ambiente',\n",
" 'ambientes',\n",
" 'ambos',\n",
" 'amena',\n",
" 'amenaza',\n",
" 'amenazas',\n",
" 'amenazó',\n",
" 'amenos',\n",
" 'amercia',\n",
" 'america',\n",
" 'american',\n",
" 'americana',\n",
" 'americanista',\n",
" 'americanistas',\n",
" 'americano',\n",
" 'amigas',\n",
" 'amigo',\n",
" 'amigos',\n",
" 'amistoso',\n",
" 'amlo',\n",
" 'amo',\n",
" 'amonestaciones',\n",
" 'amor',\n",
" 'amorcito',\n",
" 'amorebieta',\n",
" 'amorosa',\n",
" 'amp',\n",
" 'amparo',\n",
" 'amplia',\n",
" 'ampliación',\n",
" 'ampliamente',\n",
" 'ampliar',\n",
" 'amplias',\n",
" 'amplio',\n",
" 'amplitud',\n",
" 'amplía',\n",
" 'amstel',\n",
" 'américa',\n",
" 'américas',\n",
" 'ana',\n",
" 'anahuacalli',\n",
" 'anaiza',\n",
" 'analiza',\n",
" 'analizamos',\n",
" 'analizan',\n",
" 'analizando',\n",
" 'analizar',\n",
" 'analizarlas',\n",
" 'analizarlo',\n",
" 'analizaron',\n",
" 'analizará',\n",
" 'analizarán',\n",
" 'analogía',\n",
" 'ancer',\n",
" 'ancestral',\n",
" 'ancestrales',\n",
" 'anciana',\n",
" 'andalucía',\n",
" 'andar',\n",
" 'andorra',\n",
" 'andrea',\n",
" 'andrés',\n",
" 'andy',\n",
" 'anette',\n",
" 'anfiteatro',\n",
" 'anfitriones',\n",
" 'anfitrión',\n",
" 'angel',\n",
" 'angelicales',\n",
" 'anglófona',\n",
" 'angustias',\n",
" 'anhelada',\n",
" 'anhelo',\n",
" 'anillos',\n",
" 'animación',\n",
" 'animada',\n",
" 'animador',\n",
" 'animal',\n",
" 'animales',\n",
" 'animar',\n",
" 'aniversario',\n",
" 'annabella',\n",
" 'anne',\n",
" 'annia',\n",
" 'annika',\n",
" 'ano',\n",
" 'anoche',\n",
" 'anonimato',\n",
" 'anos',\n",
" 'anotaron',\n",
" 'anotes',\n",
" 'anotó',\n",
" 'ansioso',\n",
" 'antagónicos',\n",
" 'ante',\n",
" 'antecedente',\n",
" 'antecesoras',\n",
" 'antepasados',\n",
" 'antequera',\n",
" 'anterior',\n",
" 'anteriores',\n",
" 'anterioridad',\n",
" 'anteriormente',\n",
" 'antes',\n",
" 'anthony',\n",
" 'antiautoritario',\n",
" 'anticipa',\n",
" 'anticipación',\n",
" 'antigua',\n",
" 'antiguas',\n",
" 'antiguo',\n",
" 'antiguos',\n",
" 'antigüedad',\n",
" 'antigüedades',\n",
" 'antimigrante',\n",
" 'antimigrantes',\n",
" 'antipatica',\n",
" 'antipática',\n",
" 'antología',\n",
" 'antonio',\n",
" 'antropología',\n",
" 'antropóloga',\n",
" 'anual',\n",
" 'anualmente',\n",
" 'anuario',\n",
" 'anuncia',\n",
" 'anunciación',\n",
" 'anunciada',\n",
" 'anunciado',\n",
" 'anunciados',\n",
" 'anunciantes',\n",
" 'anunciar',\n",
" 'anunciaron',\n",
" 'anunciará',\n",
" 'anuncias',\n",
" 'anuncio',\n",
" 'anuncios',\n",
" 'anunciándolo',\n",
" 'anunció',\n",
" 'anáhuac',\n",
" 'análisis',\n",
" 'anécdota',\n",
" 'aorozco',\n",
" 'ap',\n",
" 'apango',\n",
" 'aparecer',\n",
" 'aparecieron',\n",
" 'apareció',\n",
" 'aparentemente',\n",
" 'aparicio',\n",
" 'apariciones',\n",
" 'aparición',\n",
" 'apariencia',\n",
" 'apartado',\n",
" 'apartados',\n",
" 'aparte',\n",
" 'apasionada',\n",
" 'apasionado',\n",
" 'apasionados',\n",
" 'apasionante',\n",
" 'apelación',\n",
" 'apelen',\n",
" 'apenado',\n",
" 'apenas',\n",
" 'apendicitis',\n",
" 'apertura',\n",
" 'apeshit',\n",
" 'aplauso',\n",
" 'aplausos',\n",
" 'aplazan',\n",
" 'aplazará',\n",
" 'aplazó',\n",
" 'aplica',\n",
" 'aplicación',\n",
" 'aplicar',\n",
" 'aplicaron',\n",
" 'aplicará',\n",
" 'apoderado',\n",
" 'aportación',\n",
" 'aportar',\n",
" 'aporte',\n",
" 'aporten',\n",
" 'aportes',\n",
" 'apostó',\n",
" ...]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# vamos a utilizar la librería de Python scikit-learn\n",
"\n",
"from sklearn.feature_extraction import DictVectorizer, FeatureHasher\n",
"\n",
"vectorizer = DictVectorizer()\n",
"vectorizer.fit_transform(tokens_frequency(d) for d in documents)\n",
"\n",
"vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2089"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.vocabulary_.get('chicago')"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Representación de los documentos de entrenamiento (tf-idf),\n",
"# como preparación para el algoritmo de aprendizaje (k vecinos más cercanos, KNN),\n",
"# para lo que también utilizaremos scikit-learn:\n",
"\n",
"\n",
"prepositions =['a','ante','bajo','cabe','con','contra','de','desde','en','entre','hacia','hasta','para','por','según','sin','so','sobre','tras']\n",
"prep_alike = ['durante','mediante','excepto','salvo','incluso','más','menos']\n",
"adverbs = ['no','si','sí']\n",
"articles = ['el','la','los','las','un','una','unos','unas','este','esta','estos','estas','aquel','aquella','aquellos','aquellas']\n",
"aux_verbs = ['he','has','ha','hemos','habéis','han','había','habías','habíamos','habíais','habían']\n",
"tfid = TfidfVectorizer(stop_words=prepositions+prep_alike+adverbs+articles+aux_verbs)\n",
"\n",
"X_train = tfid.fit_transform(documents)\n",
"y_train = labels\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=None, n_neighbors=2, p=2,\n",
" weights='uniform')"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Creamos el clasificador (clf) que puede predecir la categoría de un artículo.\n",
"# Más adelante lo probamos con un conjunto nuevo de datos.\n",
"\n",
"\n",
"\n",
"clf = KNeighborsClassifier(n_neighbors=2)\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n",
" stop_words=['a', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde', 'en', 'entre', 'hacia', 'hasta', 'para', 'por', 'según', 'sin', 'so', 'sobre', 'tras', 'durante', 'mediante', 'excepto', 'salvo', 'incluso', 'más', 'menos', 'no', 'si', 'sí', 'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'este', 'esta', 'estos', 'estas', 'aquel', 'aquella', 'aquellos', 'aquellas', 'he', 'has', 'ha', 'hemos', 'habéis', 'han', 'había', 'habías', 'habíamos', 'habíais', 'habían'],\n",
" strip_accents=None, sublinear_tf=False,\n",
" token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
" vocabulary=None)"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfid"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy score 1.000\n"
]
}
],
"source": [
"test = read_all_documents(r'C:\\BBNoticias\\test')\n",
"X_test = tfid.transform(test['docs'])\n",
"y_test = test['labels']\n",
"pred = clf.predict(X_test)\n",
"\n",
"print('accuracy score %0.3f' % clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment