Commit 5079a7b9 authored by Renán Sosa Guillen

Ajustes.

parent 2cc23082
......@@ -7,10 +7,14 @@ uso:
import scrapy
import re
# Matches any tag-like span: "<", one or more characters other than ">", ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every substring matching TAG_RE removed.

    Args:
        text: Markup string to clean, or None.

    Returns:
        The input with all tag-like spans deleted, or an empty string
        when *text* is None.
    """
    # The caller feeds this the result of Scrapy's ``extract_first()``,
    # which is None when the selector matches nothing; ``TAG_RE.sub`` would
    # raise TypeError on None, so treat a missing value as empty text.
    if text is None:
        return ''
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -20,6 +24,7 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class NoticiasSpider(scrapy.Spider):
name = "noticias"
......@@ -32,20 +37,21 @@ class NoticiasSpider(scrapy.Spider):
self.baseURL='http://www.jornada.unam.mx/'+year+'/'+month+'/'+day+'/'
urls = [
self.baseURL+"opinion",
self.baseURL+"politica",
self.baseURL+"economia",
self.baseURL+"mundo",
self.baseURL+"estados",
self.baseURL+"capital",
self.baseURL+"sociedad",
self.baseURL+"ciencias",
self.baseURL+"cultura",
self.baseURL+"espectaculos",
self.baseURL+"deporte",
# self.baseURL+"politica",
# self.baseURL+"economia",
# self.baseURL+"mundo",
# self.baseURL+"estados",
# self.baseURL+"capital",
# self.baseURL+"sociedad",
# self.baseURL+"ciencias",
# self.baseURL+"cultura",
# self.baseURL+"espectaculos",
# self.baseURL+"deporte",
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
"""parser principal."""
......@@ -54,6 +60,7 @@ class NoticiasSpider(scrapy.Spider):
url = self.baseURL + noticia.css('::attr(href)').extract_first()
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
"""Parser para la pagina de cada noticia."""
......@@ -65,5 +72,3 @@ class NoticiasSpider(scrapy.Spider):
item['text'] = remove_tags( response.css('div.text').extract_first() )
item['topic'] = response.css('img.title::attr(title)').extract_first()
yield item
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment