Commit d895ebe3 authored by Renán Sosa Guillen

crawlers

parent 3885bd5c

import scrapy, re

"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)
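
# e.g. remove_tags('<p>Hola <b>mundo</b></p>') returns 'Hola mundo'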


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
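
    # The archive URL apparently follows the standard WordPress /year/month/day
    # pattern, e.g. year=2017, month=3, day=22 -> http://alchile.com.mx/2017/3/22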

    def parse(self, response):
        pagination = response.css('div.page-nav').css('a.last::attr(href)').extract()
        if len(pagination) > 0:
            pagination = pagination[0].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
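
    # The "last page" link ends in the page count (e.g. .../page/7), so the
    # number after the final '/' tells the spider how many /page/N requests
    # to issue.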

    def parse_page(self, response):
        for link in response.css('div.td-block-span6').css('h3.entry-title').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['title'] = response.css('header.td-post-title').css('h1.entry-title::text').extract_first()
        d = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
        ## '-06:00' corresponds to UTC-6, the time zone of Yucatan (central Mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['topic'] = response.css('div.td-post-header').css('a::text').extract_first()
        for paragraph in response.css('div.td-post-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item


import scrapy, re
from datetime import datetime, timedelta, tzinfo

"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Time zone helper class"""
    def utcoffset(self, dt):
        # time zone for Yucatan (central Mexico): UTC-6
        return timedelta(hours=-6)
    def tzname(self, dt):
        # name of the time zone
        return 'UTC-6'
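    def dst(self, dt):
        # Added sketch: tzinfo subclasses are also expected to implement dst().
        # isoformat() does not need it, but it completes the interface;
        # timedelta(0) assumes no daylight-saving adjustment applies here.
        return timedelta(0)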


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.baseURL = 'http://www.desdeelbalcon.com/' + self.year + '/' + self.month + '/' + self.day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
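
    # year, month, and day are stored on self here (unlike the other spiders)
    # because parse_page reuses them to build each item's date.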

    def parse(self, response):
        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        for post in response.xpath('//ul[@class="archivepost"]/li'):
            # Instantiate a fresh item per post; reusing one instance across
            # the loop would let later posts overwrite the fields of items
            # still waiting in request.meta.
            item = NoticiasItem()
            # item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
            item['date'] = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat('T')
            item['topic'] = post.xpath('./p/a/text()').extract()
            request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
            request.meta['item'] = item
            yield request
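
    # The partially filled item rides along in request.meta, the usual
    # Scrapy pattern for passing state between callbacks.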

    def parse_item(self, response):
        text = ''
        item = response.meta['item']
        item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
        for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item


@@ -6,81 +6,81 @@ from datetime import datetime, timedelta, tzinfo

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Time zone helper class"""
    def utcoffset(self, dt):
        # time zone for Sonora (Pacific time): UTC-7
        return timedelta(hours=-7)
    def tzname(self, dt):
        # name of the time zone
        return 'UTC-7'
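    def dst(self, dt):
        # Added sketch, mirroring the UTC-6 class above: dst() completes the
        # tzinfo interface; timedelta(0) assumes no daylight-saving shift.
        return timedelta(0)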


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
        self.baseURL = 'http://diariodelyaqui.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
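
    # The ISO-8601 date is computed once in start_requests and stamped on
    # every item; presumably the article pages expose no reliable timestamp.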

    def parse(self, response):
        pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="entry-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['date'] = self.date
        item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
        item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
        for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item


@@ -6,16 +6,16 @@ scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

# class QuotesSpider(scrapy.Spider):
#     name = "noticias"

@@ -65,66 +65,66 @@ class NoticiasItem(scrapy.Item):

#         yield item


class QuotesSpider(scrapy.Spider):
    # handle_httpstatus_list = [404]
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
        # while not self.stop:
        # # for page in range(0, 50):
        #     if page == 0:
        #         yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
        #     elif page > 0:
        #         yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
        #     page += 1

    def parse(self, response):
        # if response.status == 404:
        #     print('**********hey, 404! TRUE!!!')
        #     self.stop = True
        # else:
        linkList = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
        linkList.extend(response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract())
        linkList.extend(response.xpath('//*[@class="post_header_title one"]/h5/a/@href').extract())
        for link in linkList:
            yield scrapy.Request(url=link, callback=self.parse_item)
        nextPage = response.xpath('//*[@class="pagination"]/a/@href').extract()[-1]
        yield scrapy.Request(url=nextPage, callback=self.parse)
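
    # Pagination here is recursive rather than precomputed: each listing page
    # queues its articles, then follows the last pagination link back into
    # parse; once no link remains, the IndexError ends that crawl branch.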

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        if d is None or d == '':
            d = response.xpath('//meta[@property="DC.date.issued"]/@content').extract_first()
        ## '-06:00' corresponds to UTC-6, the time zone of Yucatan (central Mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
        item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()
        paragraph = response.xpath('//*[@class="post_content_wrapper"]/p').extract()
        paragraph.extend(response.xpath('//*[@title="Page 1"]/div/p').extract())
        paragraph.extend(response.xpath('//*[@class="text_exposed_root text_exposed"]/p').extract())
        for p in paragraph:
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
\ No newline at end of file