el comentario colima

eac90030 · Mario Chirinos · c6e78aaa · eac90030
Commit eac90030 authored Sep 20, 2019 by Mario Chirinos
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 38 deletions

noticias.py descarga_por_dia/elComentario/elComentario/spiders/noticias.py +38 -38

No files found.
--- a/descarga_por_dia/elComentario/elComentario/spiders/noticias.py
+++ b/descarga_por_dia/elComentario/elComentario/spiders/noticias.py
@@ -22,55 +22,55 @@ def remove_tags(text):
 class QuotesSpider(scrapy.Spider):
-    """
+	"""
-    Basic Scrapy Spider class
+	Basic Scrapy Spider class
-    """
+	"""
-    name = "noticias"
+	name = "noticias"
-    def start_requests(self):
+	def start_requests(self):
-        year  = getattr(self, "year", None)
+		year  = getattr(self, "year", None)
-        month = getattr(self, "month", None)
+		month = getattr(self, "month", None)
-        day   = getattr(self, "day", None)
+		day   = getattr(self, "day", None)
-        baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
+		baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
-        yield scrapy.Request(url=baseURL, callback=self.parse)
+		yield scrapy.Request(url=baseURL, callback=self.parse)
-    def parse(self, response):
-        for link in response.css('div.articles').xpath('./article/div[@class="cnt"]/h3/a/@href').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-        next_page = response.css('div.post-pagination').xpath('./a[@title="Next page"]/@href').extract_first()
+	def parse(self, response):
-        if next_page is not None:
+		print("parse", response.url)
-            yield scrapy.Request(url=next_page, callback=self.parse)
+		pages = response.css("li.mkd-pagination-last-page").css("a::attr(href)").extract_first()
+		pages = int(pages[pages.find("page/")+5:-1])
+		for p in range (pages):
+			next_page = response.url+"/page/"+str(p+1)
+			yield scrapy.Request(url=next_page, callback=self.parse_page)
+	def parse_page(self, response):
+		print("parse_page", response.url)
+		links = response.css("h5").css("a::attr(href)").extract()
+		for link in links:
+			yield scrapy.Request(url=link, callback=self.parse_item)
-    def parse_item(self, response):
+	def parse_item(self, response):
-        item = NoticiasItem()
+		print("parse_item", response.url)
-        text = ''
+		item = NoticiasItem()
+		text = ''
-        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
+		item["topic"] = response.css("div.mkd-post-info-category").css("a::text").extract_first()
+		content = response.css("div.pf-content").css("p").extract()
-        title = response.xpath('//header/h1').extract_first()
+		for p in content:
-        if title is not None : title = remove_tags(title)
+			text+= remove_tags(p)+"\n"
-        topic = response.css('a.theme').extract_first()
+		text = text.strip()
-        if topic is not None : topic = remove_tags(topic)
-        for p in response.css('div.pf-content').css('p').extract():
+		item['text']  = text
-            p = remove_tags(p)
+		item['url']   = response.url
-            text += p + "\n"
+		print(item)
-        text = text.strip()
+		yield item
-        ## News item info ##
-        item['date']  = news_date
-        item['title'] = title
-        item['topic'] = topic
-        item['text']  = text
-        item['url']   = response.url
-        yield item