Commit eac90030 authored by Mario Chirinos's avatar Mario Chirinos

El Comentario (Colima)

parent c6e78aaa
@@ -22,55 +22,55 @@ def remove_tags(text):
 class QuotesSpider(scrapy.Spider):
     """
     Basic Scrapy Spider class
     """
     name = "noticias"
     def start_requests(self):
         year = getattr(self, "year", None)
         month = getattr(self, "month", None)
         day = getattr(self, "day", None)
         baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
-        yield scrapy.Request(url=baseURL, callback=self.parse)
-
-    def parse(self, response):
-        for link in response.css('div.articles').xpath('./article/div[@class="cnt"]/h3/a/@href').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-        next_page = response.css('div.post-pagination').xpath('./a[@title="Next page"]/@href').extract_first()
-        if next_page is not None:
-            yield scrapy.Request(url=next_page, callback=self.parse)
+
+        yield scrapy.Request(url=baseURL, callback=self.parse)
+
+    def parse(self, response):
+        print("parse", response.url)
+
+        pages = response.css("li.mkd-pagination-last-page").css("a::attr(href)").extract_first()
+        pages = int(pages[pages.find("page/")+5:-1])
+        for p in range(pages):
+            next_page = response.url + "/page/" + str(p+1)
+            yield scrapy.Request(url=next_page, callback=self.parse_page)
+
+    def parse_page(self, response):
+        print("parse_page", response.url)
+        links = response.css("h5").css("a::attr(href)").extract()
+        for link in links:
+            yield scrapy.Request(url=link, callback=self.parse_item)
 
     def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        title = response.xpath('//header/h1').extract_first()
-        if title is not None : title = remove_tags(title)
-        topic = response.css('a.theme').extract_first()
-        if topic is not None : topic = remove_tags(topic)
-        for p in response.css('div.pf-content').css('p').extract():
-            p = remove_tags(p)
-            text += p + "\n"
-        text = text.strip()
-        ## News item info ##
-        item['date'] = news_date
-        item['title'] = title
-        item['topic'] = topic
-        item['text'] = text
-        item['url'] = response.url
-        yield item
+        print("parse_item", response.url)
+        item = NoticiasItem()
+        text = ''
+        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
+        item["topic"] = response.css("div.mkd-post-info-category").css("a::text").extract_first()
+        content = response.css("div.pf-content").css("p").extract()
+        for p in content:
+            text += remove_tags(p) + "\n"
+        text = text.strip()
+
+        item['text'] = text
+        item['url'] = response.url
+        print(item)
+        yield item
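
For context, the crawl date is not hard-coded: start_requests() reads year, month and day from spider attributes, which Scrapy fills in from spider arguments, e.g. scrapy crawl noticias -a year=2018 -a month=3 -a day=21 (the date values are only examples). A roughly equivalent programmatic run, with an assumed import path and an illustrative output feed, might look like this sketch:

from scrapy.crawler import CrawlerProcess
# Assumed module path for illustration; adjust to the project's actual package layout.
from spiders.noticias import QuotesSpider

# The FEEDS entry is only an example output setting (JSON file in the working directory).
process = CrawlerProcess(settings={"FEEDS": {"noticias.json": {"format": "json"}}})
# Pass the date parts as strings because start_requests() calls zfill() on them.
process.crawl(QuotesSpider, year="2018", month="3", day="21")
process.start()  # blocks until the crawl finishes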