Commit f1dfa7e9 authored by Renán Sosa Guillen

crawlers

parent 720f6059
......@@ -211,6 +211,16 @@ Se incluyen los siguientes medios:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
```
No se encontraron notas antes del 2011.11.28.
* [Proceso](http://www.proceso.com.mx/)
Uso:
```bash
cd proceso
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3
```
No se encontraron notas antes de Nov. 1976.
* [Punto Medio](https://www.puntomedio.mx/)
......
[
{"nombre": "Al Chile", "crawler": "sitios_yucatan/alChile", "desde": "01-06-2014", "url": "http://alchile.com.mx/"},
{"nombre": "Desde el Balcón", "crawler": "sitios_yucatan/desdeElBalcon", "desde": "11-01-2014", "url": "http://www.desdeelbalcon.com/"},
{"nombre": "Diario de Yucatán", "crawler": "sitios_yucatan/diarioYucatan", "desde": "02-04-2012", "url": "http://yucatan.com.mx/"},
{"nombre": "El Grillo", "crawler": "sitios_yucatan/grilloPorteno", "desde": "04-11-2013", "url": "http://grilloporteno.com/"},
{"nombre": "La Jornada Maya", "crawler": "sitios_yucatan/laJornadaMaya", "desde": "12-03-2015", "url": "https://www.lajornadamaya.mx/"},
{"nombre": "La Verdad Yucatán", "crawler": "sitios_yucatan/laVerdadYuc", "desde": "01-12-2015", "url": "http://laverdadnoticias.com/"},
{"nombre": "Lector MX", "crawler": "sitios_yucatan/lectorMX", "desde": "23-10-2015", "url": "http://lectormx.com/"},
{"nombre": "Mi Punto de Vista", "crawler": "sitios_yucatan/miPuntoDeVista", "desde": "04-10-2008", "url": "http://www.mipuntodevista.com.mx/"},
{"nombre": "Notirivas", "crawler": "sitios_yucatan/notirivas", "desde": "22-11-2016", "url": "http://gruporivas.com.mx/notirivas/"},
{"nombre": "Notisureste", "crawler": "sitios_yucatan/notisureste", "desde": "28-11-2011", "url": "http://www.notisureste.com/"},
{"nombre": "Punto Medio", "crawler": "sitios_yucatan/puntoMedio", "desde": "02-08-2015", "url": "https://www.puntomedio.mx/"},
{"nombre": "Sona 89.3", "crawler": "sitios_yucatan/sona893", "desde": "09-04-2012", "url": "http://sona893.fm/"},
{"nombre": "Yucatán a la Mano", "crawler": "sitios_yucatan/yucatanALaMano", "desde": "11-05-2015", "url": "http://www.yucatanalamano.com/"},
{"nombre": "Yucatán al Minuto", "crawler": "sitios_yucatan/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com/"},
{"nombre": "Yucatán en Corto", "crawler": "sitios_yucatan/yucatanEnCorto", "desde": "02-04-2011", "url": "http://florcastillo.mx/noticias/"},
{"nombre": "Diario del Yaqui", "crawler": "otros_sitios/diarioYaqui", "desde": "30-12-2016", "url": "http://diariodelyaqui.mx"},
{"nombre": "La Jornada", "crawler": "otros_sitios/laJornada", "desde": "01-02-2005", "url": "http://www.jornada.unam.mx"},
{"nombre": "La Jornada Aguascalientes", "crawler": "otros_sitios/laJornadaAgs", "desde": "01-12-2008", "url": "http://www.lja.mx/"},
{"nombre": "La Jornada Baja California", "crawler": "otros_sitios/laJornadaBC", "desde": "09-02-2015", "url": "http://jornadabc.mx"},
{"nombre": "La Jornada Guerrero", "crawler": "otros_sitios/laJornadaGro", "desde": "22-01-2007", "url": "http://www.lajornadaguerrero.com.mx"},
{"nombre": "La Jornada de Oriente", "crawler": "otros_sitios/laJornadaOte", "desde": "01-06-2013", "url": "http://www.lajornadadeoriente.com.mx/"},
{"nombre": "La Jornada San Luis", "crawler": "otros_sitios/laJornadaSanLuis", "desde": "08-10-2014", "url": "http://lajornadasanluis.com.mx"},
{"nombre": "La Jornada Veracruz", "crawler": "otros_sitios/laJornadaVer", "desde": "11-05-2009", "url": "http://www.jornadaveracruz.com.mx"},
{"nombre": "La Jornada Zacatecas", "crawler": "otros_sitios/laJornadaZac", "desde": "10-06-2013", "url": "http://ljz.mx"}
{"nombre": "Al Chile", "crawler": "descarga_por_fecha/alChile", "desde": "01-06-2014", "url": "http://alchile.com.mx/"},
{"nombre": "Desde el Balcón", "crawler": "descarga_por_fecha/desdeElBalcon", "desde": "11-01-2014", "url": "http://www.desdeelbalcon.com/"},
{"nombre": "Diario del Yaqui", "crawler": "descarga_por_fecha/diarioYaqui", "desde": "30-12-2016", "url": "http://diariodelyaqui.mx"},
{"nombre": "Diario de Yucatán", "crawler": "descarga_por_fecha/diarioYucatan", "desde": "02-04-2012", "url": "http://yucatan.com.mx/"},
{"nombre": "El Grillo", "crawler": "descarga_por_fecha/grilloPorteno", "desde": "04-11-2013", "url": "http://grilloporteno.com/"},
{"nombre": "La Jornada", "crawler": "descarga_por_fecha/laJornada", "desde": "01-02-2005", "url": "http://www.jornada.unam.mx"},
{"nombre": "La Jornada Aguascalientes", "crawler": "descarga_por_fecha/laJornadaAgs", "desde": "01-12-2008", "url": "http://www.lja.mx/"},
{"nombre": "La Jornada Baja California", "crawler": "descarga_por_fecha/laJornadaBC", "desde": "09-02-2015", "url": "http://jornadabc.mx"},
{"nombre": "La Jornada Guerrero", "crawler": "descarga_por_fecha/laJornadaGro", "desde": "22-01-2007", "url": "http://www.lajornadaguerrero.com.mx"},
{"nombre": "La Jornada Maya", "crawler": "descarga_por_fecha/laJornadaMaya", "desde": "12-03-2015", "url": "https://www.lajornadamaya.mx/"},
{"nombre": "La Jornada de Oriente", "crawler": "descarga_por_fecha/laJornadaOte", "desde": "01-06-2013", "url": "http://www.lajornadadeoriente.com.mx/"},
{"nombre": "La Jornada San Luis", "crawler": "descarga_por_fecha/laJornadaSanLuis", "desde": "08-10-2014", "url": "http://lajornadasanluis.com.mx"},
{"nombre": "La Jornada Veracruz", "crawler": "descarga_por_fecha/laJornadaVer", "desde": "11-05-2009", "url": "http://www.jornadaveracruz.com.mx"},
{"nombre": "La Jornada Zacatecas", "crawler": "descarga_por_fecha/laJornadaZac", "desde": "10-06-2013", "url": "http://ljz.mx"},
{"nombre": "La Verdad Yucatán", "crawler": "descarga_por_fecha/laVerdadYuc", "desde": "01-12-2015", "url": "http://laverdadnoticias.com/"},
{"nombre": "Lector MX", "crawler": "descarga_por_fecha/lectorMX", "desde": "23-10-2015", "url": "http://lectormx.com/"},
{"nombre": "Mi Punto de Vista", "crawler": "descarga_por_fecha/miPuntoDeVista", "desde": "04-10-2008", "url": "http://www.mipuntodevista.com.mx/"},
{"nombre": "Notirivas", "crawler": "descarga_por_fecha/notirivas", "desde": "22-11-2016", "url": "http://gruporivas.com.mx/notirivas/"},
{"nombre": "Notisureste", "crawler": "descarga_por_fecha/notisureste", "desde": "28-11-2011", "url": "http://www.notisureste.com/"},
{"nombre": "Proceso", "crawler": "descarga_por_mes/proceso", "desde": "11-1976", "url": "http://www.proceso.com.mx/"},
{"nombre": "Punto Medio", "crawler": "descarga_por_fecha/puntoMedio", "desde": "02-08-2015", "url": "https://www.puntomedio.mx/"},
{"nombre": "Sona 89.3", "crawler": "descarga_por_fecha/sona893", "desde": "09-04-2012", "url": "http://sona893.fm/"},
{"nombre": "Yucatán a la Mano", "crawler": "descarga_por_fecha/yucatanALaMano", "desde": "11-05-2015", "url": "http://www.yucatanalamano.com/"},
{"nombre": "Yucatán al Minuto", "crawler": "descarga_por_fecha/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com/"},
{"nombre": "Yucatán en Corto", "crawler": "descarga_por_fecha/yucatanEnCorto", "desde": "02-04-2011", "url": "http://florcastillo.mx/noticias/"}
]
\ No newline at end of file
......@@ -69,8 +69,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
\ No newline at end of file
......@@ -8,17 +8,17 @@ import re
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
......@@ -49,10 +49,10 @@ class QuotesSpider(scrapy.Spider):
section = response.url[response.url.rfind('/')+1:]
if ( section == 'opinion' ): # la seccion 'opinion' tiene una estructura diferente a las otras
path_list = ['//*[@id="columnas"]/p/a/@href',
'//*[@id="opinion"]/p/a/@href']
'//*[@id="opinion"]/p/a/@href']
else:
path_list = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href']
'//*[@id="article_list"]/h3/a/@href']
for path in path_list:
for link in response.xpath(path).extract():
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment