update

parent 90fc9336
* M³-Descarga de Noticias e Texto
Crawlers para medios de información en linea nacionales basados en [Scrapy](http://scrapy.org/)
Se incluyen los siguientes medios nacionales:
...@@ -14,752 +15,3 @@ Se incluyen los siguientes medios nacionales:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
```
No se encontró notas antes del 2014.06.01. No se encontró notas antes del 2014.06.01.
* [Campeche Hoy](http://campechehoy.mx)
Acceso por día:
```bash
http://campechehoy.mx/2018/01/22/
```
Uso:
```bash
cd campecheHoy/
scrapy crawl noticias --nolog -s filename=2018-01-22.json -a year=2018 -a month=1 -a day=22
```
No se encontró notas antes del 2017.06.01.
* [Chiapas Hoy](http://www.chiapashoy.com.mx)
Acceso por día:
```bash
http://www.chiapashoy.com.mx/notashoy/2018/01/25/
```
Uso:
```bash
cd chiapasHoy/
scrapy crawl noticias --nolog -s filename=2018-01-25.json -a year=2018 -a month=1 -a day=25
```
No se encontró notas antes del 2017.03.08.
* [Cuarto Poder](http://www.cuartopoder.mx)
Acceso por día:
```bash
http://www.cuartopoder.mx/archivo/portada/listado/8-30-2018/8-30-2018/
```
Uso:
```bash
cd cuartoPoder/
scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
```
No se encontró notas antes del 2010.01.01.
* [Desde el Balcón](http://www.desdeelbalcon.com)
Acceso por día:
```bash
http://www.desdeelbalcon.com/2017/9/13/
```
Uso:
```bash
cd desdeElBalcon/
scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
```
No se encontró notas antes del 2014.01.11.
* [El Despertar de Oaxaca](http://despertardeoaxaca.com)
Acceso por día:
```bash
http://despertardeoaxaca.com/2018/2/4/
```
Uso:
```bash
cd despertarOaxaca/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2011.12.06.
* [Puntual](http://diario-puntual.com.mx)
Acceso por día:
```bash
http://diario-puntual.com.mx/2018/9/5/
```
Uso:
```bash
cd diarioPuntual/
scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
```
No se encontró notas antes del 2016.07.12.
* [Diario del Yaqui](http://diariodelyaqui.mx)
Acceso por día:
```bash
http://diariodelyaqui.mx/2017/9/13/
```
Uso:
```bash
cd diarioYaqui/
scrapy crawl noticias --nolog -s filename=2016-12-24.json -a year=2016 -a month=12 -a day=24
```
No se encontró notas antes del 2016.12.30.
* [Diario de Yucatán](http://yucatan.com.mx)
Uso:
```bash
cd diarioYucatan/
scrapy crawl noticias --nolog -s filename=noticias.json
```
No se encontró notas antes del 2012.04.02.
* [EDOMEX al Día](http://www.edomexaldia.com.mx)
Acceso por día:
```bash
http://www.edomexaldia.com.mx/2018/01/30/
```
Uso:
```bash
cd edoMexDia/
scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
```
No se encontró notas antes del 2011.09.21.
* [El Comentario](https://elcomentario.ucol.mx)
Acceso por día:
```bash
https://elcomentario.ucol.mx/2018/9/5/
```
Uso:
```bash
cd elComentario/
scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
```
No se encontró notas antes del 2016.07.01.
* [El Independiente](http://www.el-independiente.com.mx)
Acceso por día:
```bash
http://www.el-independiente.com.mx/2018/2/1/
```
Uso:
```bash
cd elIndependiente/
scrapy crawl noticias --nolog -s filename=2018-02-01.json -a year=2018 -a month=2 -a day=1
```
No se encontró notas antes del 2012.03.05.
* [El Sur](https://suracapulco.mx)
Acceso por día:
```bash
https://suracapulco.mx/2018/9/5/
```
Uso:
```bash
cd elSur/
scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
```
No se encontró notas antes del 2017.09.06.
* [El Valle](http://elvalle.com.mx)
Acceso por día:
```bash
http://elvalle.com.mx/2018/2/1/
```
Uso:
```bash
cd elValle/
scrapy crawl noticias --nolog -s filename=2018-02-01.json -a year=2018 -a month=2 -a day=1
```
No se encontró notas antes del 2016.04.19.
* [Expreso Chiapas](http://expresochiapas.com/noticias)
Acceso por día:
```bash
http://expresochiapas.com/noticias/2018/1/30/
```
Uso:
```bash
cd expresoChiapas/
scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
```
No se encontró notas antes del 2015.09.26.
* [El Grillo Porteño](http://grilloporteno.com)
Acceso por día:
```bash
http://grilloporteno.com/2017/9/13/
```
Uso:
```bash
cd grilloPorteno/
scrapy crawl noticias --nolog -s filename=2017-03-26.json -a year=2017 -a month=3 -a day=26
```
No se encontró notas antes del 2013.11.04.
* [El Heraldo Aguascalientes](http://www.heraldo.mx)
Acceso por día:
```bash
http://www.heraldo.mx/2018/01/22/
```
Uso:
```bash
cd heraldoAgs/
scrapy crawl noticias --nolog -s filename=2018-01-22.json -a year=2018 -a month=1 -a day=22
```
No se encontró notas antes del 2014.01.23.
* [El Heraldo León](http://www.heraldoleon.mx)
Acceso por día:
```bash
http://www.heraldoleon.mx/2018/01/29/
```
Uso:
```bash
cd heraldoLeon/
scrapy crawl noticias --nolog -s filename=2018-01-29.json -a year=2018 -a month=1 -a day=29
```
No se encontró notas antes del 2016.07.06.
* [La Jornada](http://www.jornada.unam.mx)
Acceso por día:
```bash
http://www.jornada.unam.mx/2017/09/13/
```
Uso:
```bash
cd laJornada/
scrapy crawl noticias --nolog -s filename=2017-04-23.json -a year=2017 -a month=4 -a day=23
```
No se encontró notas antes del 1996.03.03.
* [La Jornada Aguascalientes](http://www.lja.mx)
Acceso por día:
```bash
http://www.lja.mx/2017/9/13/
```
Uso:
```bash
cd laJornadaAgs/
scrapy crawl noticias --nolog -s filename=2017-03-26.json -a year=2017 -a month=3 -a day=26
```
No se encontró notas antes del 2008.12.01.
* [La Jornada Baja California](http://jornadabc.mx)
Uso:
```bash
cd laJornadaBC/
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=4 -a day=26
```
No se encontró notas antes del 2015.02.09.
Esta versión descarga por fecha. Se recomienda usar principalmente para fechas recientes.
* [La Jornada Guerrero (Nueva página)](http://www.lajornadaguerrero.com.mx)
Acceso por día:
```bash
http://www.lajornadaguerrero.com.mx/index.php?option=com_k2&view=itemlist&task=date&year=2017&month=9&day=17&Itemid=588
```
Uso:
```bash
cd laJornadaGro/
scrapy crawl noticias --nolog -s filename=2017-09-17.json -a year=2017 -a month=9 -a day=17
```
No se encontró notas antes del 2017.08.15 para esta version del crawler.
En general se tienen notas desde el 2007.01.22
* [La Jornada Maya](https://www.lajornadamaya.mx)
Uso:
```bash
cd laJornadaMaya/
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=4 -a day=25
```
No se encontró notas antes del 2015.03.12.
* [La Jornada de Oriente](http://www.lajornadadeoriente.com.mx)
Acceso por día:
```bash
http://www.lajornadadeoriente.com.mx/2017/9/13/
```
Uso:
```bash
cd laJornadaOte/
scrapy crawl noticias --nolog -s filename=2017-03-26.json -a year=2017 -a month=3 -a day=26
```
No se encontró notas antes del 2013.06.01.
* [La Jornada San Luis](http://lajornadasanluis.com.mx)
Acceso por día:
```bash
http://lajornadasanluis.com.mx/2017/9/13/
```
Uso:
```bash
cd laJornadaSanLuis/
scrapy crawl noticias --nolog -s filename=2017-03-26.json -a year=2017 -a month=3 -a day=26
```
No se encontró notas antes del 2014.10.08.
* [La Jornada Veracruz](http://www.jornadaveracruz.com.mx)
Acceso por día:
```bash
http://www.jornadaveracruz.com.mx/Archive.aspx?date=13/09/2017
```
Uso:
```bash
cd laJornadaVer/
scrapy crawl noticias --nolog -s filename=2017-03-26.json -a year=2017 -a month=3 -a day=26
```
No se encontró notas antes del 2009.05.11.
* [La Jornada Zacatecas](http://ljz.mx)
Acceso por día:
```bash
http://ljz.mx/2017/9/13/
```
Uso:
```bash
cd laJornadaZac/
scrapy crawl noticias --nolog -s filename=2017-03-26.json -a year=2017 -a month=3 -a day=26
```
No se encontró notas antes del 2013.06.10.
* [La Razón](https://www.razon.com.mx)
Acceso por día:
```bash
https://www.razon.com.mx/2017/9/28/
```
Uso:
```bash
cd laRazon/
scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
```
No se encontró notas antes del 2015.01.01.
* [La Verdad Yucatán](http://laverdadnoticias.com)
Acceso por día:
```bash
https://laverdadnoticias.com/2017/9/16/
```
Uso:
```bash
cd laVerdadYuc/
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=24
```
No se encontró notas antes del 2017.07.18.
* [Lector MX](http://lectormx.com)
Acceso por día:
```bash
http://lectormx.com/2017/3/30/
```
Uso:
```bash
cd lectorMX/
scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
```
No se encontró notas antes del 2015.10.23.
* [Marca](http://www.diariomarca.com.mx)
Acceso por día:
```bash
http://www.diariomarca.com.mx/2018/2/4/
```
Uso:
```bash
cd marca/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2012.07.31.
* [Mi Punto de Vista](http://www.mipuntodevista.com.mx)
Acceso por día:
```bash
http://www.mipuntodevista.com.mx/2017/9/16/
```
Uso:
```bash
cd miPuntoDeVista/
scrapy crawl noticias --nolog -s filename=2017-03-28.json -a year=2017 -a month=3 -a day=28
```
No se encontró notas antes del 2008.10.04.
* [Noticias de la Bahía](https://noticiasdelabahia.com)
Acceso por día:
```bash
https://noticiasdelabahia.com/2018/2/1/
```
Uso:
```bash
cd noticiasBahia/
scrapy crawl noticias --nolog -s filename=2018-02-01.json -a year=2018 -a month=2 -a day=1
```
No se encontró notas antes del 2016.05.23.
* [El Noticiero en Línea](http://www.elnoticieroenlinea.com)
Acceso por día:
```bash
http://www.elnoticieroenlinea.com/2018/1/3/
```
Uso:
```bash
cd noticieroLinea/
scrapy crawl noticias --nolog -s filename=2018-01-03.json -a year=2018 -a month=1 -a day=3
```
No se encontró notas antes del 2014.01.16.
* [Notirivas](http://gruporivas.com.mx/notirivas)
Acceso por día:
```bash
http://gruporivas.com.mx/notirivas/2017/9/16/
```
Uso:
```bash
cd notirivas/
scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
```
No se encontró notas antes del 2016.11.22.
* [Notisureste](http://www.notisureste.com)
Acceso por día:
```bash
http://www.notisureste.com/2017/9/13/
```
Uso:
```bash
cd notisureste/
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
```
No se encontró notas antes del 2011.11.28.
* [La Opinión de Puebla](http://www.laopinionpuebla.com)
Acceso por día:
```bash
http://www.laopinionpuebla.com/2018/2/4/
```
Uso:
```bash
cd opinionPuebla/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2011.07.01.
* [Periodico Correo](https://periodicocorreo.com.mx)
Acceso por día:
```bash
https://periodicocorreo.com.mx/2018/1/29/
```
Uso:
```bash
cd periodicoCorreo/
scrapy crawl noticias --nolog -s filename=2018-01-29.json -a year=2018 -a month=1 -a day=29
```
No se encontró notas antes del 2013.09.08.
* [Periódico Enfoque Informativo](https://enfoquenayarit.com)
Acceso por día:
```bash
https://enfoquenayarit.com/2018/2/4/
```
Uso:
```bash
cd periodicoEnfoque/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2014.12.19.
* [Periódico Express](http://www.periodicoexpress.com.mx)
Acceso por día:
```bash
http://www.periodicoexpress.com.mx/2018/2/4/
```
Uso:
```bash
cd periodicoExpress/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2017.03.25.
* [Periodico Victoria](http://periodicovictoria.mx)
Acceso por día:
```bash
http://periodicovictoria.mx/2018/1/28/
```
Uso:
```bash
cd periodicoVictoria/
scrapy crawl noticias --nolog -s filename=2018-01-28.json -a year=2018 -a month=1 -a day=28
```
No se encontró notas antes del 2013.10.09.
* [Por Esto](http://www.poresto.net)
Acceso por día:
```bash
http://www.poresto.net/2018/8/22/
```
Uso:
```bash
cd porEsto/
scrapy crawl noticias --nolog -s filename=2018-08-22.json -a year=2018 -a month=8 -a day=22
```
No se encontró notas antes del 2018.07.03.
* [Proceso](http://www.proceso.com.mx)
Uso:
```bash
cd proceso/
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3
```
No se encontró notas antes de Nov. 1976.
* [Punto Medio](https://www.puntomedio.mx)
Acceso por día:
```bash
https://www.puntomedio.mx/2017/9/13/
```
Uso:
```bash
cd puntoMedio/
scrapy crawl noticias --nolog -s filename=2017-03-27.json -a year=2017 -a month=3 -a day=27
```
No se encontró notas antes del 2015.08.02.
* [Red Crucero](http://www.red-crucero.com)
Acceso por día:
```bash
http://www.red-crucero.com/news/2018/2/4/
```
Uso:
```bash
cd redCrucero/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2016.02.08.
* [Síntesis](https://www.sintesis.mx)
Acceso por día:
```bash
https://www.sintesis.mx/2018/2/4/
```
Uso:
```bash
cd sintesis/
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
```
No se encontró notas antes del 2016.10.02.
* [Sona 89.3](http://sona893.fm)
Acceso por día:
```bash
http://sona893.fm/2017/09/13/
```
Uso:
```bash
cd sona893/
scrapy crawl noticias --nolog -s filename=2017-03-24.json -a year=2017 -a month=3 -a day=24
```
No se encontró notas antes del 2012.04.09.
* [El Sur de Campeche](http://www.elsur.mx)
Acceso por día:
```bash
http://www.elsur.mx/2018/8/10/
```
Uso:
```bash
cd surDeCampeche/
scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
```
No se encontró notas antes del 2011.10.01.
* [Tribuna de los Cabos](http://www.tribunadeloscabos.com.mx/)
Acceso por día:
```bash
http://www.tribunadeloscabos.com.mx/2018/01/18/
```
Uso:
```bash
cd tribunaCabos/
scrapy crawl noticias --nolog -s filename=2018-01-18.json -a year=2018 -a month=1 -a day=18
```
No se encontró notas antes del 2016.06.01.
* [Tribuna](http://tribunacampeche.com)
Acceso por día:
```bash
http://tribunacampeche.com/2018/8/10/
```
Uso:
```bash
cd tribunaCampeche/
scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
```
No se encontró notas antes del 2013.11.28.
* [Uno Más Uno](http://www.unomasuno.com.mx)
Acceso por día:
```bash
http://www.unomasuno.com.mx/index.php/2017/09/22/
```
Uso:
```bash
cd unoMasUno/
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=09 -a day=22
```
No se encontró notas antes del 2017.05.04.
* [Yucatán a la Mano](http://www.yucatanalamano.com)
Acceso por día:
```bash
http://www.yucatanalamano.com/2017/9/13/
```
Uso:
```bash
cd yucatanALaMano/
scrapy crawl noticias --nolog -s filename=2017-03-25.json -a year=2017 -a month=3 -a day=25
```
No se encontró notas antes del 2015.05.11.
* [Yucatán al Minuto](http://www.yucatanalminuto.com)
Uso:
```bash
cd yucatanAlMinuto/
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22
```
No se encontró notas antes del 2017.01.17.
* [Yucatán en Corto](http://florcastillo.mx/noticias)
Acceso por día:
```bash
http://www.yucatanencorto.com/noticias/2017/12/15
```
Uso:
```bash
cd yucatanEnCorto/
scrapy crawl noticias --nolog -s filename=2017-03-29.json -a year=2017 -a month=3 -a day=29
```
No se encontró notas antes del 2017.10.18 para esta version del crawler.
En general se tienen notas desde el 2011.04.02.
Adicionalmente se cuenta con los siguientes medios extranjeros:
* [Diario Co Latino, El Salvador](http://www.diariocolatino.com)
Acceso por día:
```bash
http://www.diariocolatino.com/2018/2/23/
```
Uso:
```bash
cd descarga_por_dia/foraneos/diarioCoLatino/
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
```
No se encontró notas antes del 2013.10.04.
* [El Heraldo, Honduras](http://www.elheraldo.hn)
Uso:
```bash
cd descarga_hacia_atras/foraneos/heraldoHn
scrapy crawl noticias --nolog -s filename=noticias.json # obtiene todas las posibles
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=9 # obtiene hasta una fecha dada
```
No se encontró notas antes del 2017.04.29.
* [La Prensa Gráfica, El Salvador](https://www.laprensagrafica.com)
Uso:
```bash
cd descarga_hacia_atras/foraneos/prensaGrafica/
scrapy crawl noticias --nolog -s filename=noticias.json # obtiene todas las posibles
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=28 # obtiene hasta una fecha dada
```
No se encontró notas antes del 2017.09.05.
* [The San Pedro Sun, Belice](https://www.sanpedrosun.com)
Acceso por día:
```bash
https://www.sanpedrosun.com/2018/2/23/
```
Uso:
```bash
cd descarga_por_dia/foraneos/sanPedroSun/
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
```
No se encontró notas antes del 2008.07.21.
* [Tiempo Digital, Honduras](http://tiempo.hn)
Acceso por día:
```bash
http://tiempo.hn/2018/2/23/
```
Uso:
```bash
cd descarga_por_dia/foraneos/tiempoDigitalHn/
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
```
No se encontró notas antes del 2015.04.17.
* [La Tribuna, Honduras](http://www.latribuna.hn)
Uso:
```bash
cd descarga_hacia_atras/foraneos/tribunaHn/
scrapy crawl noticias --nolog -s filename=noticias.json # obtiene todas las posibles
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=28 # obtiene hasta una fecha dada
```
No se encontró notas antes del 2015.02.12.
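Todos los crawlers del tipo `descarga_por_dia` listados arriba se invocan de la misma forma, por lo que un rango de fechas puede descargarse con un script corto. El siguiente es solo un esbozo ilustrativo (el medio y las fechas son de ejemplo); los scripts de automatización incluidos más adelante hacen esto mismo de manera general:
```python
# Esbozo: descarga un rango de dias con un crawler del tipo "descarga_por_dia".
# El medio (alChile) y las fechas son unicamente ilustrativos.
import os
import datetime

inicio = datetime.date(2017, 3, 20)
fin = datetime.date(2017, 3, 22)

os.chdir("alChile")  # carpeta del crawler correspondiente
fecha = inicio
while fecha <= fin:
    comando = ("scrapy crawl noticias --nolog -s filename={0}.json "
               "-a year={1} -a month={2} -a day={3}").format(
                   fecha.isoformat(), fecha.year, fecha.month, fecha.day)
    os.system(comando)
    fecha += datetime.timedelta(days=1)
```
Cada ejecución deja el json del día correspondiente en la carpeta del crawler, igual que la invocación manual.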
[{"nombre": "Al Chile", "crawler": "descarga_por_dia/alChile", "desde": "01-06-2014", "url": "http://alchile.com.mx"},
{"nombre": "Campeche Hoy", "crawler": "descarga_por_dia/campecheHoy", "desde": "01-06-2017", "url": "http://campechehoy.mx"},
{"nombre": "Chiapas Hoy", "crawler": "descarga_por_dia/chiapasHoy", "desde": "08-03-2017", "url": "http://www.chiapashoy.com.mx"},
{"nombre": "Cuarto Poder", "crawler": "descarga_por_dia/cuartoPoder", "desde": "01-01-2010", "url": "http://www.cuartopoder.mx"},
{"nombre": "Desde el Balcon", "crawler": "descarga_por_dia/desdeElBalcon", "desde": "11-01-2014", "url": "http://www.desdeelbalcon.com"},
{"nombre": "El Despertar de Oaxaca", "crawler": "descarga_por_dia/despertarOaxaca", "desde": "06-12-2011", "url": "http://despertardeoaxaca.com"},
{"nombre": "Puntual", "crawler": "descarga_por_dia/diarioPuntual", "desde": "12-07-2016", "url": "http://diario-puntual.com.mx"},
{"nombre": "Diario del Yaqui", "crawler": "descarga_por_dia/diarioYaqui", "desde": "30-12-2016", "url": "http://diariodelyaqui.mx"},
{"nombre": "Diario de Yucatan", "crawler": "descarga_hacia_atras/diarioYucatan", "desde": "02-04-2012", "url": "http://yucatan.com.mx"},
{"nombre": "EDOMEX al Dia", "crawler": "descarga_por_dia/edoMexDia", "desde": "21-09-2011", "url": "http://www.edomexaldia.com.mx"},
{"nombre": "El Comentario", "crawler": "descarga_por_dia/elComentario", "desde": "01-07-2016", "url": "https://elcomentario.ucol.mx"},
{"nombre": "El Independiente", "crawler": "descarga_por_dia/elIndependiente", "desde": "05-03-2012", "url": "http://www.el-independiente.com.mx"},
{"nombre": "El Sur", "crawler": "descarga_por_dia/elSur", "desde": "06-09-2017", "url": "https://suracapulco.mx"},
{"nombre": "El Valle", "crawler": "descarga_por_dia/elValle", "desde": "19-04-2016", "url": "http://elvalle.com.mx"},
{"nombre": "Expreso Chiapas", "crawler": "descarga_por_dia/expresoChiapas", "desde": "26-09-2015", "url": "http://expresochiapas.com/noticias"},
{"nombre": "El Grillo", "crawler": "descarga_por_dia/grilloPorteno", "desde": "04-11-2013", "url": "http://grilloporteno.com"},
{"nombre": "El Heraldo Aguascalientes", "crawler": "descarga_por_dia/heraldoAgs", "desde": "23-01-2014", "url": "http://www.heraldo.mx"},
{"nombre": "El Heraldo Leon", "crawler": "descarga_por_dia/heraldoLeon", "desde": "06-07-2016", "url": "http://www.heraldoleon.mx"},
{"nombre": "La Jornada", "crawler": "descarga_por_dia/laJornada", "desde": "01-02-2005", "url": "http://www.jornada.unam.mx"},
{"nombre": "La Jornada Aguascalientes", "crawler": "descarga_por_dia/laJornadaAgs", "desde": "01-12-2008", "url": "http://www.lja.mx"},
{"nombre": "La Jornada Baja California", "crawler": "descarga_hacia_atras/laJornadaBC", "desde": "09-02-2015", "url": "http://jornadabc.mx"},
// {"nombre": "La Jornada Guerrero", "crawler": "descarga_por_dia/laJornadaGro", "desde": "22-01-2007", "url": "http://www.lajornadaguerrero.com.mx"},
{"nombre": "La Jornada Guerrero (Nuevo)", "crawler": "descarga_por_dia/laJornadaGro", "desde": "15-08-2017", "url": "http://www.lajornadaguerrero.com.mx"},
{"nombre": "La Jornada Maya", "crawler": "descarga_hacia_atras/laJornadaMaya", "desde": "12-03-2015", "url": "https://www.lajornadamaya.mx"},
{"nombre": "La Jornada de Oriente", "crawler": "descarga_por_dia/laJornadaOte", "desde": "01-06-2013", "url": "http://www.lajornadadeoriente.com.mx"},
{"nombre": "La Jornada San Luis", "crawler": "descarga_por_dia/laJornadaSanLuis", "desde": "08-10-2014", "url": "http://lajornadasanluis.com.mx"},
{"nombre": "La Jornada Veracruz", "crawler": "descarga_por_dia/laJornadaVer", "desde": "11-05-2009", "url": "http://www.jornadaveracruz.com.mx"},
{"nombre": "La Jornada Zacatecas", "crawler": "descarga_por_dia/laJornadaZac", "desde": "10-06-2013", "url": "http://ljz.mx"},
{"nombre": "La Razon", "crawler": "descarga_por_dia/laRazon", "desde": "01-01-2015", "url": "https://www.razon.com.mx"},
{"nombre": "La Verdad Yucatan", "crawler": "descarga_por_dia/laVerdadYuc", "desde": "18-07-2017", "url": "http://laverdadnoticias.com"},
{"nombre": "Lector MX", "crawler": "descarga_por_dia/lectorMX", "desde": "23-10-2015", "url": "http://lectormx.com"},
{"nombre": "Marca", "crawler": "descarga_por_dia/marca", "desde": "31-07-2012", "url": "http://www.diariomarca.com.mx"},
{"nombre": "Mi Punto de Vista", "crawler": "descarga_por_dia/miPuntoDeVista", "desde": "04-10-2008", "url": "http://www.mipuntodevista.com.mx"},
{"nombre": "Noticias de la Bahia", "crawler": "descarga_por_dia/noticiasBahia", "desde": "23-05-2016", "url": "https://noticiasdelabahia.com"},
{"nombre": "El Noticiero en Linea", "crawler": "descarga_por_dia/noticieroLinea", "desde": "16-01-2014", "url": "http://www.elnoticieroenlinea.com"},
{"nombre": "Notirivas", "crawler": "descarga_por_dia/notirivas", "desde": "22-11-2016", "url": "http://gruporivas.com.mx/notirivas"},
{"nombre": "Notisureste", "crawler": "descarga_por_dia/notisureste", "desde": "28-11-2011", "url": "http://www.notisureste.com"},
{"nombre": "La Opinion de Puebla", "crawler": "descarga_por_dia/opinionPuebla", "desde": "01-07-2011", "url": "http://www.laopinionpuebla.com"},
{"nombre": "Periodico Correo", "crawler": "descarga_por_dia/periodicoCorreo", "desde": "08-09-2013", "url": "https://periodicocorreo.com.mx"},
{"nombre": "Periodico Enfoque Informativo", "crawler": "descarga_por_dia/periodicoEnfoque", "desde": "19-12-2014", "url": "https://enfoquenayarit.com"},
{"nombre": "Periodico Express", "crawler": "descarga_por_dia/periodicoExpress", "desde": "25-03-2017", "url": "http://www.periodicoexpress.com.mx"},
{"nombre": "Periodico Victoria", "crawler": "descarga_por_dia/periodicoVictoria", "desde": "09-10-2013", "url": "http://periodicovictoria.mx"},
{"nombre": "Por Esto", "crawler": "descarga_por_dia/porEsto", "desde": "03-07-2018", "url": "http://www.poresto.net"},
{"nombre": "Proceso", "crawler": "descarga_por_mes/proceso", "desde": "11-1976", "url": "http://www.proceso.com.mx"},
{"nombre": "Punto Medio", "crawler": "descarga_por_dia/puntoMedio", "desde": "02-08-2015", "url": "https://www.puntomedio.mx"},
{"nombre": "Red Crucero", "crawler": "descarga_por_dia/redCrucero", "desde": "08-02-2016", "url": "http://www.red-crucero.com"},
{"nombre": "Sintesis", "crawler": "descarga_por_dia/sintesis", "desde": "02-10-2016", "url": "https://www.sintesis.mx"},
{"nombre": "Sona 89.3", "crawler": "descarga_por_dia/sona893", "desde": "09-04-2012", "url": "http://sona893.fm"},
{"nombre": "El Sur de Campeche", "crawler": "descarga_por_dia/surDeCampeche", "desde": "01-10-2011", "url": "http://www.elsur.mx"},
{"nombre": "Tribuna de los Cabos", "crawler": "descarga_por_dia/tribunaCabos", "desde": "01-06-2016", "url": "http://www.tribunadeloscabos.com.mx"},
{"nombre": "Tribuna", "crawler": "descarga_por_dia/tribunaCampeche", "desde": "28-11-2013", "url": "http://tribunacampeche.com"},
{"nombre": "Uno Mas Uno", "crawler": "descarga_por_dia/unoMasUno", "desde": "04-05-2017", "url": "http://www.unomasuno.com.mx"},
{"nombre": "Yucatan a la Mano", "crawler": "descarga_por_dia/yucatanALaMano", "desde": "11-05-2015", "url": "http://www.yucatanalamano.com"},
{"nombre": "Yucatan al Minuto", "crawler": "descarga_hacia_atras/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com"},
{"nombre": "Yucatan en Corto", "crawler": "descarga_por_dia/yucatanEnCorto", "desde": "02-04-2011", "url": "http://www.yucatanencorto.com/noticias"},
{"nombre": "Diario Co Latino", "crawler": "descarga_por_dia/foraneos/diarioCoLatino", "desde": "04-10-2013", "url": "https://www.diariocolatino.com"},
{"nombre": "El Heraldo Hn", "crawler": "descarga_hacia_atras/foraneos/heraldoHn", "desde": "29-04-2017", "url": "http://www.elheraldo.hn"},
{"nombre": "La Prensa Grafica", "crawler": "descarga_hacia_atras/foraneos/prensaGrafica", "desde": "05-09-2017", "url": "https://www.laprensagrafica.com"},
{"nombre": "The San Pedro Sun", "crawler": "descarga_por_dia/foraneos/sanPedroSun", "desde": "21-07-2008", "url": "https://www.sanpedrosun.com"},
{"nombre": "Tiempo Digital Hn", "crawler": "descarga_por_dia/foraneos/tiempoDigitalHn", "desde": "17-04-2015", "url": "https://tiempo.hn"},
{"nombre": "La Tribuna Hn", "crawler": "descarga_por_dia/foraneos/tribunaHn", "desde": "12-02-2015", "url": "http://www.latribuna.hn"}]
[{"nombre": "El Financiero", "crawler": "descarga_por_rss/elFinanciero", "url": "http://www.elfinanciero.com.mx/"},
{"nombre": "El Universal", "crawler": "descarga_por_rss/elUniversal", "url": "http://www.eluniversal.com.mx/"},
{"nombre": "El Sol de Mexico", "crawler": "descarga_por_rss/solDeMex", "url": "https://www.elsoldemexico.com.mx/"},
{"nombre": "Diario de Yucatan", "crawler": "descarga_hacia_atras/diarioYucatan", "url": "http://www.yucatan.com.mx/"}]
import os
newsDir = '/home/geoint/virtualHDD/m3/noticias'
# newsDir = '/home/cna_service/noticias/'
os.chdir(newsDir)
mediaLst = os.listdir('.')
mediaLst.sort()
vacios_txt = open('vacios.txt','w')
empty_count = 0
for media in mediaLst:
os.chdir(media)
yearLst = os.listdir('.')
yearLst.sort()
for year in yearLst:
os.chdir(year)
fileLst = os.listdir('.')
fileLst.sort()
for file in fileLst:
fileSize = os.stat(file).st_size
if not file.startswith('.') and fileSize <= 3:
empty_count += 1
if empty_count == 1:
vacios_txt.write(media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
else:
vacios_txt.write('\n'+media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
print(media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
os.system('rm '+file)
os.chdir('..')
os.chdir('..')
vacios_txt.write('\n'+'Total archivos vacios: '+str(empty_count)+'\n')
vacios_txt.close()
print('Total archivos vacios: '+str(empty_count))
#!/bin/bash
python /home/geoint/crawlerNoticias/crawler_script.py /home/geoint/crawlerNoticias/crawler_data.json
#!/bin/bash
## ------------------------------------------------------------------
## SCRIPT PARA LA DESCARGA AUTOMATICA DE NOTICIAS CON EL CRAWLER
## ------------------------------------------------------------------
site_section=( otros_sitios sitios_yucatan )
other_site_list=( diarioYaqui laJornada laJornadaAgs laJornadaBC laJornadaGro laJornadaMaya laJornadaOte laJornadaSanLuis laJornadaVer laJornadaZac )
yuc_site_list=( alChile desdeElBalcon diarioYucatan grilloPorteno laVerdadYuc lectorMX miPuntoDeVista notirivas notisureste puntoMedio sona893 yucatanALaMano yucatanAlMinuto yucatanEnCorto )
base_path=$HOME/crawler/
cd $base_path # activando el venv (entorno virtual) del crawler
source bin/activate
PATH=$PATH:$HOME/crawler/bin/python:$HOME/crawler/bin/scrapy # rutas donde se encuentran el scrapy y python
export PATH
## CALCULO DEL NUMERO DE DIAS PARA DESCARGAR NOTICIAS -------------------------
function obtain_days() {
local last_date=$1 # parametro 1
local stop_date=$2 # parametro 2
local day_date_1=`date -d "$last_date" '+%j'` # numero del dia del anio de la fecha en "last_date"
local y1=`date -d "$last_date" '+%Y'`
local day_date_2=`date -d "$stop_date" '+%j'`
local y2=`date -d "$stop_date" '+%Y'`
if [ $y1 -eq $y2 ] # si $y1 es igual a $y2
then
local num_days=$(expr $day_date_2 - $day_date_1)
elif [ $y1 -lt $y2 ]
then
local days_date_1=0
for year in `seq $y1 $y2`
do
if [ $year -eq $y1 ]
then
local days_date=$(expr `date -d "$y1-12-31" '+%j'` - $day_date_1)
elif [ $year -eq $y2 ]
then
days_date=$day_date_2
else
days_date=`date -d "$year-12-31" '+%j'`
fi
days_date_1=$(expr $days_date_1 + $days_date)
done
local num_days=$(expr $days_date_1)
fi
return $num_days # NOTA: 'return' en bash solo admite valores 0-255; suficiente para la actualizacion diaria
}
## ----------------------------------------------------------------------------
## SECUENCIA DE DESCARGA DE NOTICIAS --------------------------------------------------------------
for section in ${site_section[@]}
do
if [ $section = otros_sitios ]
then
list=${other_site_list[@]}
else
list=${yuc_site_list[@]}
fi
for site in $list
do
## POR CADA SITIO ENCUENTRA EL ARCHIVO CON LA ULTIMA FECHA EN QUE SE DESCARGO NOTICIAS ----
cd crawledJsonFiles/$section/$site
max=`ls | tail -1` # obtiene el ultimo directorio
cd $max
json_file=`ls | tail -1` # obtiene el ultimo archivo dentro del directorio
## ----------------------------------------------------------------------------------------
cd ~/crawler
last_date=`date -d "${json_file%%.*}" '+%Y-%m-%d'`
stop_date=`date -d "now" '+%Y-%m-%d'` # descarga hasta una fecha antes de esta
## NOTA: Para que descargue hasta una fecha antes, el paro debe fijarse una fecha despues. Por eso 'stop_date' se fija con 'now'.
if [ $last_date != $stop_date ]
then
last_date=`date -d "$last_date +1 days" '+%Y-%m-%d'`
## FUNCION 'obtain_days' CALCULA EL NUMERO DE DIAS ENTRE LA ULTIMA FECHA DE DESCARGA Y LA FECHA DE PARO
obtain_days $last_date $stop_date # parametros que se pasan a la funcion
num_days=$? # retorno del valor por parte de la funcion 'obtain_days'
for i in `seq $num_days -1 1`
do
y=`date -d "$stop_date - $i days" '+%Y'`
m=`date -d "$stop_date - $i days" '+%m'`
d=`date -d "$stop_date - $i days" '+%d'`
cd cawlersNoticias/$section/$site/ # ruta donde se encuentran alojados los crawlers de cada sitio
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # revisa si el archivo contiene informacion, sino lo elimina
if [ -e $y-$m-$d.json ] # revisa si se genero el archivo json con las noticias
then
destination=$HOME/crawler/prueba/$section/$site/$y/ # ruta donde se guardaran los json generados
if [ ! -d $destination ] # si no existe la ruta de destino la crea
then
mkdir -p $destination
fi
mv -f $y-$m-$d.json $destination # mueve el archivo json a la ruta de destino
fi
cd ~/crawler
done
fi
done
done
deactivate
## ------------------------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para la descarga constante de histórico de medios del tipo "descarga_hacia_atras".
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "noticias.json"
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
print strDate
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'] + " " + filename)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para la descarga constante de histórico de medios del tipo "descarga_por_dia".
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
year = desde.year
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
for y in range(year, today.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
day = currentDate.timetuple().tm_yday
elif y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para la descarga automatica de noticias por rss
"""
import sys
import json
import os
import datetime
from collections import OrderedDict
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
return row
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
# desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
desde = today
print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/") + 1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
CORRECT_YEAR = False
while not CORRECT_YEAR:
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
element = lstYears[len(lstYears) - 1]
if element[-4:] == 'json':
os.system('rm ' + element)
else:
CORRECT_YEAR = True
else:
break
if CORRECT_YEAR: year = int(element)
else: year = desde.year
for y in range(year, today.year + 1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde.date()
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays) - 1]
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# day = currentDate.timetuple().tm_yday
# elif y != desde.year:
# currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
# day = 1
for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
YESTERDAY = False
filename = currentDate.strftime('%Y-%m-%d') + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3:
os.system("rm " + filename)
else:
f1 = mydir + "/" + filename
f2 = filename
f3 = baseDir + media + "/" + filename
try:
with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
master = json.load(infile1)
slave = json.load(infile2)
urlSet = set([line['url'] for line in master])
counter = 0
infile3.write("[")
for line in master:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(",\n" + json.dumps(row))
for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate:
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
infile3.write(",\n" + json.dumps(row))
elif lineDate != currentDate and (currentDate - lineDate).days == 1:
YESTERDAY = True
infile3.write("]")
os.system("mv " + f3 + " " + mydir)
# os.system("rm " + f2)
except:
os.system("cp " + f2 + " " + mydir)
if YESTERDAY:
currentDate -= datetime.timedelta(days=1)
filenameYesterday = currentDate.strftime('%Y-%m-%d') + ".json"
f1 = mydir + '/' + filenameYesterday
f2 = filename
f3 = baseDir + media + '/' + filenameYesterday
with open(f2) as infile2, open(f3, 'a') as infile3:
try:
infile1 = open(f1)
master = json.load(infile1)
yesterdayFlag = True
except:
yesterdayFlag = False
urlSet = set()
slave = json.load(infile2)
infile3.write("[")
if yesterdayFlag:
urlSet = set([line['url'] for line in master])
counter = 0
for line in master:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(",\n" + json.dumps(row))
counter = 0
for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if not yesterdayFlag and counter == 1:
infile3.write(json.dumps(row))
else:
infile3.write(",\n" + json.dumps(row))
infile3.write("]")
if yesterdayFlag: infile1.close()
os.system("mv " + f3 + " " + mydir)
os.system("rm " + f2)
os.chdir(mydir)
if YESTERDAY:
currentDate += datetime.timedelta(days=2)
else:
currentDate += datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para el trackeo de histórico de medios del tipo "descarga_hacia_atras".
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "news.json"
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# print strDate
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
#
# else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'] + " " + filename)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
# os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Descarga las noticias de un sitio entre dos fechas especificas para medios del tipo 'descarga_por_dia'.
USO:
tracker.py data.json
"""
import sys
import json
import os
import datetime
# today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], "%d-%m-%Y")
hasta = datetime.datetime.strptime(s['hasta'], "%d-%m-%Y")
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# lstYears = os.listdir(".")
# lstYears.sort()
year = desde.year
# if len(lstYears) > 0:
# year = int(lstYears[len(lstYears)-1])
for y in range(year, hasta.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
# lstDays = os.listdir(".")
# lstDays = [l for l in lstDays if not l.startswith('.')]
# lstDays.sort()
# print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# day = currentDate.timetuple().tm_yday
# elif y != desde.year:
if y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if hasta.year!=y else hasta.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# print hasta.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
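# --- Ejemplo (esbozo, valores hipoteticos) de la configuracion que espera tracker.py ---
# Ademas de 'nombre', 'crawler', 'desde' y 'url', cada entrada requiere el campo 'hasta';
# ambas fechas van en formato dd-mm-aaaa. El medio y las fechas de este ejemplo no son reales.
import json

ejemplo = [{"nombre": "Medio de Ejemplo",
            "crawler": "descarga_por_dia/medioEjemplo",
            "desde": "01-01-2018",
            "hasta": "31-01-2018",
            "url": "http://ejemplo.mx"}]

with open("data.json", "w") as outfile:
    json.dump(ejemplo, outfile)
# Uso (como indica el docstring anterior): tracker.py data.json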
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para el trackeo del medio PROCESO.
"""
import sys
import os
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
s = {"crawler": "descarga_por_mes/proceso"}
media = s['crawler'][s['crawler'].rfind("/")+1:]
os.chdir(baseDir)
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# rango va de 1976 a 2018
for year in xrange(1976, 2019):
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
filename = str(year) + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename={0} -a year={1}".format(filename, str(year))
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
sys_command = "python3 parse_date_files.py {0} {1}".format(s['crawler'], filename)
os.system(sys_command)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
# os.chdir("..")
Para el crawler de la seccion 'Hemeroteca' del sitio 'Proceso' se requirio lo siguiente (aparte de scrapy):
-docker
-splash
-scrapy-splash
=================================================
Instalacion DOCKER, ubuntu 16.04
=================================================
$ sudo apt-get update
$ sudo apt-get upgrade
$ sudo apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D
$ sudo apt-add-repository 'deb https://apt.dockerproject.org/repo ubuntu-xenial main'
$ sudo apt-get update
$ sudo apt-get install docker-engine
> Inicializar/detener Docker:
$ sudo service docker start/stop
> Version:
$ docker --version
Consulta: https://thishosting.rocks/install-docker-on-ubuntu/
=================================================
Instalacion SPLASH
=================================================
$ sudo docker pull scrapinghub/splash
> Inicializar el contenedor:
$ sudo docker run -p 8050:8050 scrapinghub/splash ## con esto splash esta disponible en puerto
## 8050 (http) en navegador (localhost:8050)
Consulta: http://splash.readthedocs.io/en/latest/install.html
=================================================
Instalacion scrapy-splash
=================================================
$ pip install scrapy-splash
> Configuraciones en settings.py:
* Para usar scrapy-splash en un proyecto primero se necesita habilitar el middleware:
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
* Agregar el SPLASH_URL:
SPLASH_URL = 'http://localhost:8050/'
* Habilitar SplashDeduplicateArgsMiddleware, que permite ahorrar espacio en disco evitando almacenar multiples argumentos duplicados de Splash.
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
* Configurar DUPEFILTER_CLASS y almacenamiento cache:
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
Consulta: https://blog.scrapinghub.com/2015/03/02/handling-javascript-in-scrapy-with-splash/
https://github.com/scrapy-plugins/scrapy-splash
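> Ejemplo minimo de spider con scrapy-splash (esbozo ilustrativo; el nombre del spider y la URL no pertenecen al proyecto):
import scrapy
from scrapy_splash import SplashRequest

class EjemploSplashSpider(scrapy.Spider):
    name = 'ejemplo_splash'

    def start_requests(self):
        # SplashRequest envia la peticion a Splash (SPLASH_URL) para renderizar la pagina con javascript
        yield SplashRequest('http://example.com', self.parse, args={'wait': 1.0})

    def parse(self, response):
        # 'response' contiene el HTML ya renderizado por Splash
        self.log(response.css('title::text').extract_first())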
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Toma como entrada un archivo json que contiene noticias con diferentes fechas.
Devuelve las noticias en carpetas separadas por año.
Uso:
python parse_date_files.py <ruta_del_crawler> <nombre_archivo>
Ej.
python parse_date_files.py descarga_hacia_atras/laJornadaBC2 noticias.json
"""
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
return row
info = sys.argv[1]
news_file = sys.argv[2]
media = info[info.rfind("/") + 1:]
download_type = info[:info.rfind("/")]
this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = this_file_path + "/" + download_type + "/" + media
destination_path = this_file_path + "/" + media
json_file = json.loads(open(json_file_path + "/" + news_file).read())
date_set = set()
for news in json_file:
if news['date'] is not None:
news_date = news['date'][:news['date'].rfind("T")]
if len(news_date) > 10:
news_date = news['date'][:news['date'].rfind(' ')]
if not news_date in date_set:
date_set.add(news_date)
print(news_date)
urlSet = set()
try:
export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
except:
os.makedirs(destination_path + "/" + news_date[:4])
export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
counter = 0
export_file.write("[")
for line in json_file:
if line['date'] is not None:
line_date = line['date'][:line['date'].rfind("T")]
if len(line_date) > 10:
line_date = line['date'][:line['date'].rfind(' ')]
if not line['url'] in urlSet and line_date == news_date:
urlSet.add(line['url'])
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
export_file.write(json.dumps(row))
elif counter > 1:
export_file.write(",\n" + json.dumps(row))
export_file.write("]")
export_file.close()
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Parseo de fechas para las noticias descargadas del tipo 'descarga_hacia_atras'
Uso:
python parse_date_files.py <nombre_del_crawler>
Ej.
python parse_date_files.py laJornadaBC2
"""
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
return row
download_type = 'descarga_hacia_atras'
this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = this_file_path+'/'+download_type+'/'+sys.argv[1]
destination_path = this_file_path+'/'+sys.argv[1]
json_file = json.loads(open(json_file_path+'/noticias.json').read())
date_set = set()
urlSet = set()
for news in json_file:
if news['date'] is not None:
news_date = news['date'][:news['date'].rfind('T')]
if len(news_date) > 10:
news_date = news['date'][:news['date'].rfind(' ')]
if not news_date in date_set:
date_set.add(news_date)
print(news_date)
try:
export_file = open(destination_path+'/'+news_date[:4]+'/'+news_date+'.json', 'a')
except:
os.makedirs(destination_path+'/'+news_date[:4])
export_file = open(destination_path+'/'+news_date[:4]+'/'+news_date+'.json', 'a')
counter = 0
export_file.write("[")
for line in json_file:
if line['date'] is not None:
line_date = line['date'][:line['date'].rfind('T')]
if len(line_date) > 10:
line_date = line['date'][:line['date'].rfind(' ')]
if not line['url'] in urlSet and line_date == news_date:
urlSet.add(line['url'])
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
export_file.write(json.dumps(row))
elif counter > 1:
export_file.write(",\n" + json.dumps(row))
export_file.write("]")
export_file.close()
...@@ -26,7 +26,12 @@ def findLastDate(directory):
return None
#===============================================================================
def updateDir(directory, cfg, endDate=datetime.datetime.now()):
def crawlNews(directory, cfg, endDate=datetime.datetime.now()):
'''
directory: directory's absolute path where json files are going to be stored, it must contain a
'''
startDate = findLastDate(directory)
print(startDate, endDate)
# endDate = datetime.datetime.now()
...@@ -47,7 +52,10 @@ def updateDir(directory, cfg, endDate=datetime.datetime.now()):
#===============================================================================
def main(argv):
'''
'''
if len(argv) != 2 and len(argv) != 3:
print ("Usage: " + argv[0] + "<directory> [endDate:YYYY-MM-DD]")
else:
...@@ -55,9 +63,9 @@ def main(argv):
cfg = json.load(json_file)
if len(argv)==2:
updateDir(argv[1], cfg)
crawlNews(argv[1], cfg)
if len(argv)==3:
updateDir(argv[1], cfg, datetime.datetime.strptime(argv[2], '%Y-%m-%d'))
crawlNews(argv[1], cfg, datetime.datetime.strptime(argv[2], '%Y-%m-%d'))
if __name__ == "__main__":
main(sys.argv)
import os
newsDir = '/home/geoint/virtualHDD/m3/noticias'
# newsDir = '/home/cna_service/noticias/'
os.chdir(newsDir)
mediaLst = os.listdir('.')
mediaLst.sort()
vacios_txt = open('vacios.txt','w')
empty_count = 0
for media in mediaLst:
os.chdir(media)
yearLst = os.listdir('.')
yearLst.sort()
for year in yearLst:
os.chdir(year)
fileLst = os.listdir('.')
fileLst.sort()
for file in fileLst:
fileSize = os.stat(file).st_size
if not file.startswith('.') and fileSize <= 3:
empty_count += 1
if empty_count == 1:
vacios_txt.write(media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
else:
vacios_txt.write('\n'+media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
print(media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
os.system('rm '+file)
os.chdir('..')
os.chdir('..')
vacios_txt.write('\n'+'Total archivos vacios: '+str(empty_count)+'\n')
vacios_txt.close()
print('Total archivos vacios: '+str(empty_count))
\ No newline at end of file
#!/bin/bash
python /home/geoint/crawlerNoticias/crawler_script.py /home/geoint/crawlerNoticias/crawler_data.json
#!/bin/bash
## ------------------------------------------------------------------
## SCRIPT FOR THE AUTOMATIC DOWNLOAD OF NEWS WITH THE CRAWLER
## ------------------------------------------------------------------
site_section=( otros_sitios sitios_yucatan )
other_site_list=( diarioYaqui laJornada laJornadaAgs laJornadaBC laJornadaGro laJornadaMaya laJornadaOte laJornadaSanLuis laJornadaVer laJornadaZac )
yuc_site_list=( alChile desdeElBalcon diarioYucatan grilloPorteno laVerdadYuc lectorMX miPuntoDeVista notirivas notisureste puntoMedio sona893 yucatanALaMano yucatanAlMinuto yucatanEnCorto )
base_path=$HOME/crawler/
cd $base_path # activate the crawler's venv (virtual environment)
source bin/activate
PATH=$PATH:$HOME/crawler/bin/python:$HOME/crawler/bin/scrapy # paths where scrapy and python are located
export PATH
## CALCULATION OF THE NUMBER OF DAYS OF NEWS TO DOWNLOAD -------------------------
function obtain_days() {
local last_date=$1 # parameter 1
local stop_date=$2 # parameter 2
local day_date_1=`date -d "$last_date" '+%j'` # day-of-year number of the date in "last_date"
local y1=`date -d "$last_date" '+%Y'`
local day_date_2=`date -d "$stop_date" '+%j'`
local y2=`date -d "$stop_date" '+%Y'`
if [ $y1 -eq $y2 ] # if $y1 equals $y2
then
local num_days=$(expr $day_date_2 - $day_date_1)
elif [ $y1 -lt $y2 ]
then
local days_date_1=0
for year in `seq $y1 $y2`
do
if [ $year -eq $y1 ]
then
local days_date=$(expr `date -d "$y1-12-31" '+%j'` - $day_date_1)
elif [ $year -eq $y2 ]
then
days_date=$day_date_2
else
days_date=`date -d "$year-12-31" '+%j'`
fi
days_date_1=$(expr $days_date_1 + $days_date)
done
local num_days=$(expr $days_date_1)
fi
return $num_days
}
## ----------------------------------------------------------------------------
## NEWS DOWNLOAD SEQUENCE --------------------------------------------------------------
for section in ${site_section[@]}
do
if [ $section = otros_sitios ]
then
list=${other_site_list[@]}
else
list=${yuc_site_list[@]}
fi
for site in $list
do
## FOR EACH SITE, FIND THE FILE WITH THE LAST DATE ON WHICH NEWS WAS DOWNLOADED ----
cd crawledJsonFiles/$section/$site
max=`ls | tail -1` # gets the last directory
cd $max
json_file=`ls | tail -1` # gets the last file inside the directory
## ----------------------------------------------------------------------------------------
cd ~/crawler
last_date=`date -d "${json_file%%.*}" '+%Y-%m-%d'`
stop_date=`date -d "now" '+%Y-%m-%d'` # downloads up to one day before this date
## NOTE: To make it download up to one day before, the stop must be set one day later. That is why 'stop_date' is set to 'now'.
if [ $last_date != $stop_date ]
then
last_date=`date -d "$last_date +1 days" '+%Y-%m-%d'`
## THE 'obtain_days' FUNCTION COMPUTES THE NUMBER OF DAYS BETWEEN THE LAST DOWNLOAD DATE AND THE STOP DATE
obtain_days $last_date $stop_date # parameters passed to the function
num_days=$? # value returned by the 'obtain_days' function
for i in `seq $num_days -1 1`
do
y=`date -d "$stop_date - $i days" '+%Y'`
m=`date -d "$stop_date - $i days" '+%m'`
d=`date -d "$stop_date - $i days" '+%d'`
cd cawlersNoticias/$section/$site/ # path where each site's crawlers are located
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler for the corresponding site
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # check whether the file contains data; if not, delete it
if [ -e $y-$m-$d.json ] # check whether the JSON file with the news was generated
then
destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated JSON files will be stored
if [ ! -d $destination ] # create the destination path if it does not exist
then
mkdir -p $destination
fi
mv -f $y-$m-$d.json $destination # move the JSON file to the destination path
fi
cd ~/crawler
done
fi
done
done
deactivate
## ------------------------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for the constant download of the historical archive of media of the "descarga_hacia_atras" type.
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
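	# NOTE (illustrative): the site-list JSON is assumed to be a list of objects of which only the
	# 'crawler' key (relative path of the spider project) is read here; companion scripts in this
	# repo also read 'nombre', 'desde' and 'hasta'. The example values below are made up:
	# [
	#   {"nombre": "La Jornada BC", "crawler": "descarga_hacia_atras/laJornadaBC", "desde": "01-01-2017"}
	# ]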
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "noticias.json"
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
print strDate
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'] + " " + filename)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler for the corresponding site
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for the constant download of the historical archive of media of the "descarga_por_dia" type.
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
year = desde.year
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
for y in range(year, today.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
day = currentDate.timetuple().tm_yday
elif y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
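		# Iterate from the resume day-of-year through Dec 31 of year y, or only up to today when y is the current year.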
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler for the corresponding site
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for the automatic download of news via RSS.
"""
import sys
import json
import os
import datetime
from collections import OrderedDict
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
return row
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
# desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
desde = today
print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/") + 1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
CORRECT_YEAR = False
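	# Locate the most recent year subdirectory; stray .json files left in the media directory are
	# deleted until a directory name is found (otherwise fall back to desde.year).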
while not CORRECT_YEAR:
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
element = lstYears[len(lstYears) - 1]
if element[-4:] == 'json':
os.system('rm ' + element)
else:
CORRECT_YEAR = True
else:
break
if CORRECT_YEAR: year = int(element)
else: year = desde.year
for y in range(year, today.year + 1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde.date()
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays) - 1]
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# day = currentDate.timetuple().tm_yday
# elif y != desde.year:
# currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
# day = 1
for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
YESTERDAY = False
filename = currentDate.strftime('%Y-%m-%d') + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3:
os.system("rm " + filename)
else:
f1 = mydir + "/" + filename
f2 = filename
f3 = baseDir + media + "/" + filename
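				# Merge step: f1 = file already stored for this date from a previous run, f2 = freshly
				# crawled file, f3 = temporary merged output. Existing items are written first, then new
				# URLs from the fresh crawl for the same date are appended. An item dated one day earlier
				# sets YESTERDAY so it can be merged into the previous day's file below.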
try:
with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
master = json.load(infile1)
slave = json.load(infile2)
urlSet = set([line['url'] for line in master])
counter = 0
infile3.write("[")
for line in master:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(",\n" + json.dumps(row))
for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate:
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
infile3.write(",\n" + json.dumps(row))
elif lineDate != currentDate and (currentDate - lineDate).days == 1:
YESTERDAY = True
infile3.write("]")
os.system("mv " + f3 + " " + mydir)
# os.system("rm " + f2)
except:
os.system("cp " + f2 + " " + mydir)
if YESTERDAY:
currentDate -= datetime.timedelta(days=1)
filenameYesterday = currentDate.strftime('%Y-%m-%d') + ".json"
f1 = mydir + '/' + filenameYesterday
f2 = filename
f3 = baseDir + media + '/' + filenameYesterday
with open(f2) as infile2, open(f3, 'a') as infile3:
try:
infile1 = open(f1)
master = json.load(infile1)
yesterdayFlag = True
except:
yesterdayFlag = False
urlSet = set()
slave = json.load(infile2)
infile3.write("[")
if yesterdayFlag:
urlSet = set([line['url'] for line in master])
counter = 0
for line in master:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(",\n" + json.dumps(row))
counter = 0
for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if not yesterdayFlag and counter == 1:
infile3.write(json.dumps(row))
else:
infile3.write(",\n" + json.dumps(row))
infile3.write("]")
if yesterdayFlag: infile1.close()
os.system("mv " + f3 + " " + mydir)
os.system("rm " + f2)
os.chdir(mydir)
if YESTERDAY:
currentDate += datetime.timedelta(days=2)
else:
currentDate += datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler for the corresponding site
The crawler for the 'Hemeroteca' section of the 'Proceso' site required the following (besides scrapy):
-docker
-splash
-scrapy-splash
=================================================
DOCKER installation, ubuntu 16.04
=================================================
$ sudo apt-get update
$ sudo apt-get upgrade
$ sudo apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys 58118E89F3A912897C070ADBF76221572C52609D
$ sudo apt-add-repository 'deb https://apt.dockerproject.org/repo ubuntu-xenial main'
$ sudo apt-get update
$ sudo apt-get install docker-engine
> Start/stop Docker:
$ sudo service docker start/stop
> Version:
$ docker --version
Reference: https://thishosting.rocks/install-docker-on-ubuntu/
=================================================
SPLASH installation
=================================================
$ sudo docker pull scrapinghub/splash
> Start the container:
$ sudo docker run -p 8050:8050 scrapinghub/splash ## this makes splash available on port
## 8050 (http) in a browser (localhost:8050)
Reference: http://splash.readthedocs.io/en/latest/install.html
=================================================
scrapy-splash installation
=================================================
$ pip install scrapy-splash
> Settings in settings.py:
* To use scrapy-splash in a project, first enable the middleware:
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
* Add the SPLASH_URL:
SPLASH_URL = 'http://localhost:8050/'
* Enable SplashDeduplicateArgsMiddleware, which saves disk space by avoiding the storage of multiple duplicate Splash arguments:
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
* Configure DUPEFILTER_CLASS and cache storage:
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
References: https://blog.scrapinghub.com/2015/03/02/handling-javascript-in-scrapy-with-splash/
https://github.com/scrapy-plugins/scrapy-splash
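> Minimal usage sketch (illustrative only; the spider name and URL below are placeholders, not the
  actual 'Proceso' crawler): once the settings above are in place, pages are rendered through Splash
  by yielding SplashRequest instead of scrapy.Request:

# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

class EjemploSplashSpider(scrapy.Spider):
    name = "ejemplo_splash"               # hypothetical spider name
    start_urls = ["http://example.com/"]  # placeholder URL

    def start_requests(self):
        for url in self.start_urls:
            # ask Splash to render the page (executing JavaScript) before it reaches parse()
            yield SplashRequest(url, self.parse, args={"wait": 2.0})

    def parse(self, response):
        # response.text here is the HTML after JavaScript execution
        yield {"url": response.url, "title": response.css("title::text").get()}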
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Takes as input a JSON file containing news items with different dates.
Writes the news items into folders separated by year.
Usage:
python parse_date_files.py <crawler_path> <file_name>
E.g.
python parse_date_files.py descarga_hacia_atras/laJornadaBC2 noticias.json
"""
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
return row
info = sys.argv[1]
news_file = sys.argv[2]
media = info[info.rfind("/") + 1:]
download_type = info[:info.rfind("/")]
this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = this_file_path + "/" + download_type + "/" + media
destination_path = this_file_path + "/" + media
json_file = json.loads(open(json_file_path + "/" + news_file).read())
date_set = set()
for news in json_file:
if news['date'] is not None:
news_date = news['date'][:news['date'].rfind("T")]
if len(news_date) > 10:
news_date = news['date'][:news['date'].rfind(' ')]
if not news_date in date_set:
date_set.add(news_date)
print(news_date)
urlSet = set()
try:
export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
except:
os.makedirs(destination_path + "/" + news_date[:4])
export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
counter = 0
export_file.write("[")
for line in json_file:
if line['date'] is not None:
line_date = line['date'][:line['date'].rfind("T")]
if len(line_date) > 10:
line_date = line['date'][:line['date'].rfind(' ')]
if not line['url'] in urlSet and line_date == news_date:
urlSet.add(line['url'])
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
export_file.write(json.dumps(row))
elif counter > 1:
export_file.write(",\n" + json.dumps(row))
export_file.write("]")
export_file.close()
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Date parsing for news downloaded with crawlers of the 'descarga_hacia_atras' type.
Usage:
python parse_date_files.py <crawler_name>
E.g.
python parse_date_files.py laJornadaBC2
"""
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
return row
download_type = 'descarga_hacia_atras'
this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = this_file_path+'/'+download_type+'/'+sys.argv[1]
destination_path = this_file_path+'/'+sys.argv[1]
json_file = json.loads(open(json_file_path+'/noticias.json').read())
date_set = set()
urlSet = set()
for news in json_file:
if news['date'] is not None:
news_date = news['date'][:news['date'].rfind('T')]
if len(news_date) > 10:
news_date = news['date'][:news['date'].rfind(' ')]
if not news_date in date_set:
date_set.add(news_date)
print(news_date)
try:
export_file = open(destination_path+'/'+news_date[:4]+'/'+news_date+'.json', 'a')
except:
os.makedirs(destination_path+'/'+news_date[:4])
export_file = open(destination_path+'/'+news_date[:4]+'/'+news_date+'.json', 'a')
counter = 0
export_file.write("[")
for line in json_file:
if line['date'] is not None:
line_date = line['date'][:line['date'].rfind('T')]
if len(line_date) > 10:
line_date = line['date'][:line['date'].rfind(' ')]
if not line['url'] in urlSet and line_date == news_date:
urlSet.add(line['url'])
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
export_file.write(json.dumps(row))
elif counter > 1:
export_file.write(",\n" + json.dumps(row))
export_file.write("]")
export_file.close()
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for tracking the historical archive of media of the "descarga_hacia_atras" type.
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "news.json"
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# print strDate
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
#
# else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'] + " " + filename)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
# os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler for the corresponding site
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Downloads a site's news between two specific dates, for media of the 'descarga_por_dia' type.
USAGE:
tracker.py data.json
"""
import sys
import json
import os
import datetime
# today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], "%d-%m-%Y")
hasta = datetime.datetime.strptime(s['hasta'], "%d-%m-%Y")
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# lstYears = os.listdir(".")
# lstYears.sort()
year = desde.year
# if len(lstYears) > 0:
# year = int(lstYears[len(lstYears)-1])
for y in range(year, hasta.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
# lstDays = os.listdir(".")
# lstDays = [l for l in lstDays if not l.startswith('.')]
# lstDays.sort()
# print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# day = currentDate.timetuple().tm_yday
# elif y != desde.year:
if y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if hasta.year!=y else hasta.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# print hasta.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler for the corresponding site
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for tracking the PROCESO media outlet.
"""
import sys
import os
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
s = {"crawler": "descarga_por_mes/proceso"}
media = s['crawler'][s['crawler'].rfind("/")+1:]
os.chdir(baseDir)
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# range goes from 1976 to 2018
for year in xrange(1976, 2019):
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
filename = str(year) + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename={0} -a year={1}".format(filename, str(year))
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
sys_command = "python3 parse_date_files.py {0} {1}".format(s['crawler'], filename)
os.system(sys_command)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
# os.chdir("..")
#!/usr/bin/python3
import sys
import os
import json
from pathlib import Path
import chardet
#from myModule import myModule
#===============================================================================
def ascii2utf8(inputfilename, outputfilename):
print(inputfilename)
with open(inputfilename) as json_file:
data = json.load(json_file)#.read().decode("unicode_escape")
print(data)
with open(outputfilename, 'w') as outfile:
json.dump(data, outfile, ensure_ascii=False, indent=1)
#===============================================================================
def copyDirStructure(indir, outdir):
print(indir)
path = Path(indir)
dirs = [e.name for e in path.iterdir() if e.is_dir()]
if not os.path.exists(outdir+path.name):
os.mkdir(outdir+path.name)
for d in dirs:
yeardir = outdir+path.name+"/"+d+"/"
print(path.name, d)
if not os.path.exists(yeardir):
os.mkdir(yeardir)
filepath = Path(indir+d)
files = [e.name for e in filepath.glob("*.json")]
for f in files:
ascii2utf8(indir+d+"/"+f, yeardir+f)
#===============================================================================
def main(argv):
if len(sys.argv) != 3:
print ("Usage: " + argv[0] + " <input dir> <output dir>")
else:
copyDirStructure(argv[1], argv[2])
if __name__ == "__main__":
main(sys.argv)
""" """
Spider for edomexaldia.com Spider for edomexaldia.com
Author: Mario Chirinos Coluga Author: Mario Chirinos Coluga
Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23 Usage:scrapy crawl noticias --nolog -O 2021-09-12.json -a year=2021 -a month=9 -a day=12
""" """
import scrapy, re import scrapy, re