ljz date

e9188630 · Mario Chirinos · 39b868fa · e9188630
Commit e9188630 authored Jul 12, 2023 by Mario Chirinos
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 7 deletions

noticias.py spiders/daily/laJornadaZac/laJornadaZac/spiders/noticias.py +7 -7

No files found.
--- a/spiders/daily/laJornadaZac/laJornadaZac/spiders/noticias.py
+++ b/spiders/daily/laJornadaZac/laJornadaZac/spiders/noticias.py
@@ -18,12 +18,12 @@ class NoticiasSpider(scrapy.Spider):
 	start_urls = ['http://ljz.mx/']
 	#-----------------------------------------------------------------------
 	def start_requests(self):
-		year = getattr(self, "year", None)
-		month = getattr(self, "month", None)
-		day = getattr(self, "day", None)
+		self.year = getattr(self, "year", None)
+		self.month = getattr(self, "month", None)
+		self.day = getattr(self, "day", None)

-		self.date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
-		self.baseURL = "https://ljz.mx/" + self.day.zfill(2) + "/" + month.zfill(2) + "/" + year + "/"
+		self.date = self.year + "-" + self.month.zfill(2) + "-" + self.day.zfill(2)
+		self.baseURL = "https://ljz.mx/" + self.day.zfill(2) + "/" + self.month.zfill(2) + "/" + self.year + "/"
 		yield scrapy.Request(url=self.baseURL, callback=self.parse)
 		
 	def parse(self, response):
@@ -40,13 +40,13 @@ class NoticiasSpider(scrapy.Spider):
 	def parse_item(self, response):
 #		print(response.url)
 		item = LajornadazacItem()
-		item["date"] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		item["date"] = self.year+"-"+self.month+"-"+self.day
 		item["title"] = response.xpath('//meta[@itemprop="headline"]/@content').extract_first()
 		item["topic"] = response.xpath('//meta[@property="article:section"]/@content').extract_first()
 		item["author"] = response.xpath('//p[@class="ljz_coauthors"]/i//text()').extract_first()
 		text=""
 		for p in response.xpath('//p[@class="ljz_coauthors"]/following-sibling::div//p').extract():
-			text += remove_tags(p) + "\n"
+			text += remove_tags(p) + "\n "
 		item["text"]=text
 		item["url"]=response.url
 		print(self.allowed_domains,item["title"])