modificacion el comentario

9cf59fb3 · Mario Chirinos · 673e21e0 · 9cf59fb3
Commit 9cf59fb3 authored May 23, 2023 by Mario Chirinos
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 13 deletions

noticias.py spiders/daily/elComentario/elComentario/spiders/noticias.py +13 -13

No files found.
--- a/spiders/daily/elComentario/elComentario/spiders/noticias.py
+++ b/spiders/daily/elComentario/elComentario/spiders/noticias.py
@@ -12,11 +12,11 @@ class NoticiasSpider(scrapy.Spider):
 	start_urls = ['http://elcomentario.ucol.mx/']
 	def start_requests(self):
-		year  = getattr(self, "year", None)
+		self.year  = getattr(self, "year", None)
-		month = getattr(self, "month", None)
+		self.month = getattr(self, "month", None)
-		day   = getattr(self, "day", None)
+		self.day   = getattr(self, "day", None)
-		baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
+		baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
 		yield scrapy.Request(url=baseURL, callback=self.parse)
@@ -24,26 +24,26 @@ class NoticiasSpider(scrapy.Spider):
 	def parse(self, response):
 		print(response.url)
-		for link in response.xpath('//h5[@class="mkd-pt-six-title"]/a/@href').extract():
+		for link in response.xpath('//h2[@class="thumb-title"]/a/@href').extract():
-			yield scrapy.Request(url=link, callback=self.parse_item)
+			yield scrapy.Request(url="https://elcomentario.ucol.mx"+link, callback=self.parse_item)
-		next_page = response.xpath('//li[@class="mkd-pagination-next"]/a/@href').extract_first()
+		next_page = response.xpath('//li[@class="the-next-page"]/a/@href').extract_first()
 		print("next_page", next_page)
 		if next_page is not None:
 			yield scrapy.Request(url=next_page, callback=self.parse)
 	#-----------------------------------------------------------------------
 	def parse_item(self, response):
-		print(response.url)
+#		print(response.url)
 		item = ElcomentarioItem()
-		item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
+		item["date"] = self.year+"-"+self.month.zfill(2)+"-"+self.day.zfill(2)
-		item["title"] =  response.xpath("//meta[@property='og:title']/@content").extract_first()
+		item["title"] =  response.xpath("//div[@class='entry-header']/h1/text()").extract_first()
-		item["topic"] = ""
+		item["topic"] = response.xpath("//div[@class='entry-header']/span/a/text()").extract_first()
 		text=""
-		for p in response.xpath('//div[@class="pf-content"]/p').extract():
+		for p in response.xpath('//div[@class="entry-content entry clearfix"]/p').extract():
 			text += remove_tags(p) + "\n"
 		item["text"]=text
 		item["url"]=response.url
-		print(item["title"])
+		print(self.allowed_domains,item["title"])
 		yield(item)