Commit 7d116982 authored by Renán Sosa Guillen

crawlers

parent 81ee8b42
import scrapy, re import scrapy, re
'''
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
'''
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -21,11 +21,12 @@ class NoticiasItem(scrapy.Item): ...@@ -21,11 +21,12 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='https://www.puntomedio.mx/'+year+'/'+month+'/'+day self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse) yield scrapy.Request(url=self.baseURL, callback=self.parse)
...@@ -34,8 +35,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -34,8 +35,8 @@ class QuotesSpider(scrapy.Spider):
for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract(): for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first() nextPage = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
yield scrapy.Request(url=next_page, callback=self.parse) yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_item(self, response): def parse_item(self, response):
...@@ -49,12 +50,14 @@ class QuotesSpider(scrapy.Spider): ...@@ -49,12 +50,14 @@ class QuotesSpider(scrapy.Spider):
d = d[:-6] + '-06:00' d = d[:-6] + '-06:00'
item['date'] = d item['date'] = d
item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract_first() item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
for paragraph in response.css('div.post-entry').css('p').extract(): for p in response.css('div.post-entry').css('p').extract():
text += remove_tags(paragraph) text += remove_tags(p)
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment