Commit 5079a7b9 authored by Renán Sosa Guillen

Ajustes.

parent 2cc23082
...@@ -7,10 +7,14 @@ uso: ...@@ -7,10 +7,14 @@ uso:
import scrapy import scrapy
import re import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -20,6 +24,7 @@ class NoticiasItem(scrapy.Item): ...@@ -20,6 +24,7 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class NoticiasSpider(scrapy.Spider): class NoticiasSpider(scrapy.Spider):
name = "noticias" name = "noticias"
...@@ -32,20 +37,21 @@ class NoticiasSpider(scrapy.Spider): ...@@ -32,20 +37,21 @@ class NoticiasSpider(scrapy.Spider):
self.baseURL='http://www.jornada.unam.mx/'+year+'/'+month+'/'+day+'/' self.baseURL='http://www.jornada.unam.mx/'+year+'/'+month+'/'+day+'/'
urls = [ urls = [
self.baseURL+"opinion", self.baseURL+"opinion",
self.baseURL+"politica", # self.baseURL+"politica",
self.baseURL+"economia", # self.baseURL+"economia",
self.baseURL+"mundo", # self.baseURL+"mundo",
self.baseURL+"estados", # self.baseURL+"estados",
self.baseURL+"capital", # self.baseURL+"capital",
self.baseURL+"sociedad", # self.baseURL+"sociedad",
self.baseURL+"ciencias", # self.baseURL+"ciencias",
self.baseURL+"cultura", # self.baseURL+"cultura",
self.baseURL+"espectaculos", # self.baseURL+"espectaculos",
self.baseURL+"deporte", # self.baseURL+"deporte",
] ]
for url in urls: for url in urls:
yield scrapy.Request(url=url, callback=self.parse) yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
"""parser principal.""" """parser principal."""
...@@ -54,6 +60,7 @@ class NoticiasSpider(scrapy.Spider): ...@@ -54,6 +60,7 @@ class NoticiasSpider(scrapy.Spider):
url = self.baseURL + noticia.css('::attr(href)').extract_first() url = self.baseURL + noticia.css('::attr(href)').extract_first()
yield scrapy.Request(url, callback=self.parse_dir_contents) yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response): def parse_dir_contents(self, response):
"""Parser para la pagina de cada noticia.""" """Parser para la pagina de cada noticia."""
...@@ -65,5 +72,3 @@ class NoticiasSpider(scrapy.Spider): ...@@ -65,5 +72,3 @@ class NoticiasSpider(scrapy.Spider):
item['text'] = remove_tags( response.css('div.text').extract_first() ) item['text'] = remove_tags( response.css('div.text').extract_first() )
item['topic'] = response.css('img.title::attr(title)').extract_first() item['topic'] = response.css('img.title::attr(title)').extract_first()
yield item yield item
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment