Commit 392e0d4e authored by Renán Sosa Guillen

crawlers

parent 77fa0911
@@ -104,7 +104,7 @@ class QuotesSpider(scrapy.Spider):
         for paragraph in response.css('div.entry-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
-        item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
+        item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
         item['url'] = response.url
         # print item['title']
         yield item
...
@@ -108,7 +108,7 @@ class QuotesSpider(scrapy.Spider):
         for paragraph in response.css('div.entry-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
-        item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
+        item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
         item['url'] = response.url
         # print item['title']
         yield item
...
@@ -73,7 +73,7 @@ class QuotesSpider(scrapy.Spider):
         text = ''
         item['date'] = self.date
         item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
-        item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
+        item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
         for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
             text += remove_tags(paragraph) + '\n'
...
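Editor's note: most hunks in this commit make the same one-token change, so it is worth spelling out once. In Scrapy, .extract() always returns a list of every match, while .extract_first() returns the first match as a plain string, or None when nothing matches, so item['topic'] becomes a scalar field and a missing element no longer risks an IndexError. A minimal sketch of the difference, using made-up HTML rather than any of the scraped sites:

    from scrapy.http import HtmlResponse

    html = b'<ul class="post-categories"><li><a>Politica</a></li><li><a>Estado</a></li></ul>'
    response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

    # extract() -> a list of every match, even when there is only one
    response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
    # ['Politica', 'Estado']

    # extract_first() -> the first match as a string, or None when nothing matches
    response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
    # 'Politica'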
@@ -43,7 +43,6 @@ class QuotesSpider(scrapy.Spider):
         ## the article date already includes the time zone
         item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
         item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
-        item['author'] = response.xpath('//span[@itemprop="name"]/text()').extract_first()
         for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
             text += remove_tags(paragraph) + '\n'
...
@@ -54,9 +54,15 @@ class QuotesSpider(scrapy.Spider):
         item = NoticiasItem()
         text = ''
         ## the article date already includes the time zone
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        if d is None:
+            d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
+        item['date'] = d
         item['title'] = response.css('h1.entry-title::text').extract_first()
         item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
         for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
...
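Editor's note: the new date logic queries the article:published_time meta tag first and falls back to the <time> element's datetime attribute only when the tag is absent. If this pattern keeps recurring across spiders, it could be factored into a small helper; the sketch below is ours, and the name first_match is not part of the commit:

    def first_match(response, *xpaths):
        # Try each XPath in order; return the first non-None result.
        for xp in xpaths:
            value = response.xpath(xp).extract_first()
            if value is not None:
                return value
        return None

    # item['date'] = first_match(response,
    #     '//meta[@property="article:published_time"]/@content',
    #     '//time[@class="entry-date updated"]/@datetime')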
@@ -54,7 +54,7 @@ class QuotesSpider(scrapy.Spider):
         d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
         ## '-06:00' corresponds to UTC-6, the Zacatecas time zone (central Mexico)
-        if d[-6:] != '-05:00' and d[-6:] != '-06:00' :
+        if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
...
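Editor's note: the slicing works because ISO-8601 timestamps end in a fixed-width UTC offset, so d[-6:] is the ±HH:MM suffix and d[:-6] is everything before it. Worked through on an invented timestamp:

    d = '2017-10-18T09:30:00+00:00'
    d[-6:]   # '+00:00'  (the offset suffix)
    d[:-6]   # '2017-10-18T09:30:00'
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    d        # '2017-10-18T09:30:00-06:00'

Note that this relabels the offset without shifting the hour, which presumably suits sites that report central-Mexico wall-clock time under a wrong offset.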
@@ -71,7 +71,6 @@ class QuotesSpider(scrapy.Spider):
         item['date'] = self.date
         item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
         item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
-        item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first()
         for paragraph in response.css('div.post-single-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
...
@@ -65,8 +65,7 @@ class QuotesSpider(scrapy.Spider):
             d = d[:-6] + '-06:00'
         item['date'] = d
-        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
-        item['author'] = response.css('div.td-post-author-name').css('a::text').extract_first()
+        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
         for paragraph in response.css('div.td-post-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
...
@@ -65,7 +65,7 @@ class QuotesSpider(scrapy.Spider):
         item['date'] = d
         item['url'] = response.url
-        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
+        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
         for paragraph in response.css('div.td-post-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
...
@@ -49,8 +49,7 @@ class QuotesSpider(scrapy.Spider):
             d = d[:-6] + '-06:00'
         item['date'] = d
-        item['author'] = response.css('span.author').css('a::text').extract_first()
-        item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract()
+        item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract_first()
         for paragraph in response.css('div.post-entry').css('p').extract():
             text += remove_tags(paragraph)
...
@@ -50,7 +50,7 @@ class QuotesSpider(scrapy.Spider):
     def parse_page(self, response):
         for post in response.css('div.mosaicflow').css('div.post'):
             item = NoticiasItem()
-            item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract()
+            item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract_first()
             item['title'] = post.xpath('./h1/a/@title').extract_first()
             request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
             request.meta['item'] = item
...
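Editor's note: request.meta['item'] = item is the standard Scrapy idiom for carrying a partially filled item into the next callback, where it is recovered and completed. A minimal sketch of the two halves, trimmed from the hunk above and assuming the spider's existing NoticiasItem and scrapy imports:

    def parse_page(self, response):
        for post in response.css('div.post'):
            item = NoticiasItem()
            item['title'] = post.xpath('./h1/a/@title').extract_first()
            request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(),
                                     callback=self.parse_item)
            request.meta['item'] = item   # stash the partial item on the request
            yield request

    def parse_item(self, response):
        item = response.meta['item']      # recover it in the detail-page callback
        item['url'] = response.url        # fill in the remaining fields here
        yield item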
@@ -3,6 +3,9 @@ import scrapy, re
 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+Crawler downloads news from 2017.10.18
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
...
@@ -24,16 +27,16 @@ class QuotesSpider(scrapy.Spider):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://florcastillo.mx/noticias/'+year+'/'+month+'/'+day
+        self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
-        pagination = response.css('div.pagination').css('a::attr(href)').extract()
+        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
         if ( len(pagination) > 0 ):
-            pagination = pagination[-1]
+            pagination = pagination[-2].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
             for page in range(0, pages):
...
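Editor's note: the page count is now read out of the pagination links themselves. pagination[-2] takes the next-to-last link (in this theme the final entry is typically the "next" arrow; that is our reading, not stated in the commit), strip('/') drops the trailing slash, and rfind('/') isolates the final path segment, which is the last page number. Worked through on an invented URL:

    link = 'http://www.yucatanencorto.com/noticias/2017/10/18/page/12/'
    link = link.strip('/')                  # '.../page/12'
    pages = int(link[link.rfind('/')+1:])   # 12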
@@ -48,25 +51,32 @@ class QuotesSpider(scrapy.Spider):
     def parse_page(self, response):
-        for link in response.css('div.list-block').xpath('./h3/a/@href').extract():
+        for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
-        item['title'] = response.css('div.post-title').css('h1.entry-title::text').extract_first()
+        title = response.xpath('//*[@class="td-post-header-holder"]/header/h1/text()').extract_first()
+        if title is None:
+            title = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
+        item['title'] = title
-        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
         ## '-06:00' corresponds to UTC-6, the Yucatan time zone (central Mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
-        item['topic'] = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()[:-2]
+        item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a/text()').extract()[1]
-        for paragraph in response.css('div.post_content').css('p').extract():
-            text += remove_tags(paragraph) + '\n'
+        paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
+        if len(paragraphs) <= 2:
+            paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
+        for p in paragraphs:
+            text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
...
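Editor's note: the rewritten body extraction prefers the <div> children of td-post-content and falls back to <p> children when it finds two or fewer divs, i.e. when the theme lays the article out as bare paragraphs; the threshold of 2 presumably tolerates wrapper divs that exist even on paragraph-style pages. The same logic as a standalone helper, a sketch with our own name extract_body, reusing the file's remove_tags:

    def extract_body(response):
        # Prefer div-wrapped paragraphs; fall back to bare <p> tags.
        paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
        if len(paragraphs) <= 2:
            paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
        return ''.join(remove_tags(p) + '\n' for p in paragraphs)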