Commit 7d116982 authored by Renán Sosa Guillen

crawlers

parent 81ee8b42
import scrapy, re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
'''
# Matches a single markup tag: '<', one or more non-'>' characters, then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every substring matching ``TAG_RE`` removed.

    Only complete ``<...>`` sequences are stripped; a lone ``<`` that is
    never closed by ``>`` is left untouched.
    """
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    """Scrapy item holding the fields collected for one news article."""

    # Each field is declared exactly once; a repeated declaration would only
    # rebind the same class attribute.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Spider that crawls the puntomedio.mx archive listing for one date.

    The date is taken from the ``year``, ``month`` and ``day`` spider
    arguments, for example::

        scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
    """

    name = "noticias"

    def start_requests(self):
        """Yield the request for the listing page of the requested date.

        ``year``, ``month`` and ``day`` are read from the spider's attributes
        and concatenated into the URL as-is, so they must be strings. If any
        of them is missing (``None``) the concatenation raises ``TypeError``.
        """
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        self.baseURL = 'http://www.puntomedio.mx/' + year + '/' + month + '/' + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link on a listing page, then the next page.

        Article links are handed to ``parse_item``; the "older posts" link,
        when present, is parsed again by this method.
        """
        for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

        next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
        # extract_first() returns None when the selector matches nothing
        # (i.e. on the last listing page); building a Request from that
        # would raise instead of letting the crawl finish cleanly.
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        """Build and yield a ``NoticiasItem`` from one article page.

        Fills ``title``, ``date``, ``topic``, ``text`` and ``url``.
        """
        item = NoticiasItem()
        item['title'] = response.css('h1.title::text').extract_first()

        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        # '-06:00' is UTC-6, the time zone of Yucatan (central Mexico).
        # The last six characters are overwritten with that offset; the
        # date/time part of the string itself is not shifted.
        # NOTE(review): d is None when the meta tag is absent, which makes
        # the slice below raise TypeError -- confirm whether such pages exist.
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d

        item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()

        # Concatenate the tag-stripped contents of every paragraph in the post body.
        paragraphs = response.css('div.post-entry').css('p').extract()
        item['text'] = ''.join(remove_tags(p) for p in paragraphs)

        item['url'] = response.url

        yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment