Commit bf20a7d5 authored by Renán Sosa Guillen

crawlers

parent 19792a13
@@ -43,8 +43,6 @@ class QuotesSpider(scrapy.Spider):
         self.stopDate = None
         baseURL = "http://www.latribuna.hn/"
-        # self.baseURI = self.baseURL + "/ajax/get_section_news.html?viewmore=%2Fajax%2Fget_section_news.html&page="
-        # self.sectionURI = "&size=6&section="
         sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
                        "ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
@@ -55,14 +53,13 @@ class QuotesSpider(scrapy.Spider):
         # sectionList = ["noticias"]
         for s in sectionList:
-            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
+            yield scrapy.Request(url=baseURL + s, callback=self.parse)
 
     def parse(self, response):
-        # searchData = ImportantData()
         CONTINUE_SEARCHING = True
 
-        linkList = response.xpath('//section[@class="section-67"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
-        linkList.extend(response.xpath('//section[@class="section-67"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
+        linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
+        linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
 
         if self.stopDate is None:
             for link in linkList:
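Note: the selector change above (section.section-67 replaced by div#main) can be sanity-checked offline before re-running the crawler. A minimal sketch, assuming Scrapy is installed; the HTML fragment is hypothetical and only mirrors the structure the chained XPath/CSS expects:

    from scrapy import Selector

    # Hypothetical listing fragment shaped like the markup the new selectors target.
    SAMPLE_LISTING = """
    <div id="main">
      <article class="linkbox">
        <a itemprop="url" href="/noticias/story-1/">Story 1</a>
      </article>
      <div class="bottom-margin">
        <div class="col-sm-6">
          <h3><a itemprop="url" href="/noticias/story-2/">Story 2</a></h3>
        </div>
      </div>
    </div>
    """

    sel = Selector(text=SAMPLE_LISTING)
    # The same two selector chains this commit switches to.
    linkList = sel.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
    linkList.extend(sel.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
    print(linkList)  # expected: ['/noticias/story-1/', '/noticias/story-2/']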
@@ -87,64 +84,21 @@ class QuotesSpider(scrapy.Spider):
                 yield scrapy.Request(url=nextPage, callback=self.parse)
 
-    # def continue_searching(self, response):
-    #     searchData = response.meta['item']
-    #     CONTINUE_SEARCHING = True
-    #
-    #     linkList = response.xpath('//article/div/h1/a/@href').extract()
-    #
-    #     if len(linkList) > 0:
-    #         if self.stopDate is None:
-    #             for link in linkList:
-    #                 yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
-    #
-    #         else:
-    #             for link in linkList:
-    #                 res = DAT_RE.search(link)
-    #                 if res:
-    #                     dat = res.group(0).replace("-", '')
-    #                     newsDate = date(int(dat[:4]), int(dat[4:6]), int(dat[6:]))
-    #                     if newsDate >= self.stopDate:
-    #                         yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
-    #
-    #                     else:
-    #                         CONTINUE_SEARCHING = False
-    #                         break
-    #
-    #         else:
-    #             CONTINUE_SEARCHING = False
-    #
-    #     if CONTINUE_SEARCHING:
-    #         searchData['page'] += 1
-    #         page = searchData['page']
-    #         section = searchData['section']
-    #         url = self.baseURI + str(page) + self.sectionURI + section
-    #         request = scrapy.Request(url=url, callback=self.continue_searching)
-    #         request.meta['item'] = searchData
-    #         yield request
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
         "The date obtained already includes its format and time zone"
-        newsData = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
-        newsDict = json.loads(newsData)
-        item['date'] = newsDict['datePublished']
-        item['title'] = newsDict['headline']
+        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item['title'] = remove_tags(response.xpath('//header/h1[@itemprop="name"]').extract_first())
 
         try:
-            topic = response.xpath('//div[@class="news-line"]/a/text()').extract()[-1]
+            topic = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]
         except:
             topic = None
         item['topic'] = topic
 
-        author = response.css('div.content-author').xpath('./p/meta[@itemprop="name"]/@content').extract_first()
-        if author is not None:
-            item['author'] = author
-
-        for p in response.css('div.news-body').css('p').extract():
+        for p in response.css('div.article-post-content').css('p').extract():
             text += remove_tags(p) + "\n"
 
         item['text'] = text.strip()
...
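The reworked parse_item can be exercised the same way. Below is a minimal sketch of what the new selectors yield, assuming remove_tags is w3lib.html.remove_tags (a common import in these spiders, not shown in this diff); the article fragment is hypothetical:

    from scrapy import Selector
    from w3lib.html import remove_tags

    # Hypothetical article fragment matching the selectors parse_item now uses.
    SAMPLE_ARTICLE = """
    <html><head>
      <meta property="article:published_time" content="2017-08-01T10:15:00-06:00">
    </head><body>
      <header><h1 itemprop="name">Example headline</h1></header>
      <aside class="tags"><ul><li><a href="#">Honduras</a></li></ul></aside>
      <div class="article-post-content"><p>First paragraph.</p><p>Second paragraph.</p></div>
    </body></html>
    """

    sel = Selector(text=SAMPLE_ARTICLE)
    item = {}
    # The meta tag carries an ISO-8601 timestamp with its UTC offset, matching
    # the "already includes format and time zone" note in parse_item.
    item['date'] = sel.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    item['title'] = remove_tags(sel.xpath('//header/h1[@itemprop="name"]').extract_first())
    item['topic'] = sel.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]

    # Same accumulation loop as the committed parse_item.
    text = ''
    for p in sel.css('div.article-post-content').css('p').extract():
        text += remove_tags(p) + "\n"
    item['text'] = text.strip()

    print(item)  # date, title, topic, and text as plain strings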