Commit 5079a7b9 authored by Renán Sosa Guillen

Ajustes.

parent 2cc23082
......@@ -7,56 +7,63 @@ uso:
import scrapy
import re
# Matches a single markup tag: "<", one or more non-">" characters, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag stripped out.

    Only the tags themselves are removed; the text between them is kept
    unchanged (entities are not decoded, whitespace is not normalised).

    Args:
        text: Markup string to clean, or ``None``.

    Returns:
        The string without tags. ``None`` yields an empty string, because
        the spider passes in the result of ``extract_first()``, which is
        ``None`` when the selector matched nothing.
    """
    if text is None:
        return ''
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    """Fields collected for a single news article.

    Every field is declared exactly once; redeclaring a field only
    overwrites the earlier declaration and adds nothing.
    """

    title = scrapy.Field()     # headline text (filled from ``div.cabeza``)
    text = scrapy.Field()      # article body with markup tags removed
    date = scrapy.Field()      # NOTE(review): not populated in the visible code - confirm
    location = scrapy.Field()  # filled from ``p.s-s``
    author = scrapy.Field()    # NOTE(review): not populated in the visible code - confirm
    topic = scrapy.Field()     # filled from the ``title`` attribute of ``img.title``
    url = scrapy.Field()       # URL of the article page
class NoticiasSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Yield one request per section index page of the selected edition.

    The edition date is read from the spider attributes ``year``,
    ``month`` and ``day`` (presumably supplied as spider arguments, e.g.
    ``-a year=2017 -a month=03 -a day=05`` - confirm against the usage
    notes at the top of the file). The values are inserted into the URL
    exactly as given; no zero-padding is applied.

    Side effects:
        Sets ``self.baseURL``, which ``parse`` later uses to build the
        article URLs.

    Raises:
        ValueError: if any of ``year``, ``month`` or ``day`` is missing.
            Because this is a generator, the error surfaces when the
            first request is pulled.
    """
    date_parts = [
        ('year', getattr(self, 'year', None)),
        ('month', getattr(self, 'month', None)),
        ('day', getattr(self, 'day', None)),
    ]
    missing = [name for name, value in date_parts if value is None]
    if missing:
        # Fail with an explicit message instead of the opaque TypeError
        # that concatenating ``str + None`` would produce.
        raise ValueError(
            'missing spider argument(s): ' + ', '.join(missing)
        )

    year, month, day = [value for _, value in date_parts]
    self.baseURL = 'http://www.jornada.unam.mx/{}/{}/{}/'.format(
        year, month, day
    )

    # Section index pages to crawl for the chosen edition.
    sections = (
        "opinion",
        "politica",
        "economia",
        "mundo",
        "estados",
        "capital",
        "sociedad",
        "ciencias",
        "cultura",
        "espectaculos",
        "deporte",
    )
    for section in sections:
        yield scrapy.Request(url=self.baseURL + section, callback=self.parse)
def parse(self, response):
    """Main parser: follow every headline link on a section index page.

    Each ``a.cabeza`` anchor in *response* is turned into a request for
    the article page, handled by ``parse_dir_contents``. The link target
    is appended to ``self.baseURL`` (set in ``start_requests``).

    Args:
        response: The downloaded section index page.

    Yields:
        One ``scrapy.Request`` per headline anchor that carries an
        ``href``.
    """
    for headline in response.css('a.cabeza'):
        href = headline.css('::attr(href)').extract_first()
        if href is None:
            # No href found: nothing to follow, and concatenating None
            # to the base URL would raise TypeError.
            continue
        yield scrapy.Request(
            self.baseURL + href, callback=self.parse_dir_contents
        )
def parse_dir_contents(self, response):
"""Parser para la pagina de cada noticia."""
"""Parser para la pagina de cada noticia."""
item = NoticiasItem()
item['title'] = response.css('div.cabeza::text').extract_first()
item['url'] = response.url
......@@ -64,6 +71,4 @@ class NoticiasSpider(scrapy.Spider):
item['location'] = response.css('p.s-s::text').extract_first()
item['text'] = remove_tags( response.css('div.text').extract_first() )
item['topic'] = response.css('img.title::attr(title)').extract_first()
yield item
yield item
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment