heraldo leon

3ea10cf1 · Mario Chirinos · 43fbb106
Commit 3ea10cf1 authored Dec 17, 2024 by Mario Chirinos
3 changed files
--- a/spiders/daily/adNoticas/adNoticas/spiders/noticias.py
+++ b/spiders/daily/adNoticas/adNoticas/spiders/noticias.py
@@ -20,12 +20,12 @@ class NoticiasSpider(scrapy.Spider):
 	start_urls = ['http://adnoticias.mx/']
 	#-----------------------------------------------------------------------
 	def start_requests(self):
-		year = getattr(self, "year", None)
+		self.year = getattr(self, "year", None)
-		month = getattr(self, "month", None)
+		self.month = getattr(self, "month", None)
-		day = getattr(self, "day", None)
+		self.day = getattr(self, "day", None)
-		self.date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
+		self.date = self.year + "-" + self.month.zfill(2) + "-" + self.day.zfill(2)
-		self.baseURL = "https://adnoticias.mx/" + year + "/" + month.zfill(2) + "/" + self.day.zfill(2) + "/"
+		self.baseURL = "https://adnoticias.mx/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2) + "/"
 		yield scrapy.Request(url=self.baseURL, callback=self.parseList)
 	#-----------------------------------------------------------------------
 	def parseList(self, response):

--- a/spiders/daily/heraldoLeon/heraldoLeon/settings.py
+++ b/spiders/daily/heraldoLeon/heraldoLeon/settings.py
@@ -17,7 +17,7 @@ FEED_EXPORT_ENCODING="utf-8"
 #USER_AGENT = 'heraldoLeon (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -25,7 +25,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -41,7 +41,10 @@ ROBOTSTXT_OBEY = True
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
 #}
+DEFAULT_REQUEST_HEADERS = {
+    # ... Other headers
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
+}
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {

--- a/spiders/daily/heraldoLeon/heraldoLeon/spiders/noticias.py
+++ b/spiders/daily/heraldoLeon/heraldoLeon/spiders/noticias.py
@@ -16,15 +16,14 @@ def remove_tags(text):
 class NoticiasSpider(scrapy.Spider):
 	name = 'noticias'
 	allowed_domains = ['heraldoleon.mx']
-	start_urls = ['http://heraldoleon.mx/']
+	start_urls = ['https://heraldoleon.mx/']
 	def start_requests(self):
 		self.year = getattr(self, "year", None)
 		self.month = getattr(self, "month", None)
 		self.day = getattr(self, "day", None)
-		self.baseURL = "http://www.heraldoleon.mx/" + self.year + "/" + self.month + "/" + self.day
+		self.baseURL = "https://www.heraldoleon.mx/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2) + "/"
 		yield scrapy.Request(url=self.baseURL, callback=self.parse)
 	#-----------------------------------------------------------------------
 	def parse(self, response):
@@ -32,8 +31,8 @@ class NoticiasSpider(scrapy.Spider):
 		for link in response.xpath('//h3[@class="entry-title td-module-title"]/a/@href').extract():
 			yield scrapy.Request(url=link, callback=self.parse_item)
+		next_page = response.xpath('//div[contains(@class,"page-nav")]/a[@aria-label="next-page"]/@href').extract_first()
-		next_page = response.xpath('//div[@class="page-nav td-pb-padding-side"]/a/i[@class="td-icon-menu-right"]/../@href').extract_first()
+#		next_page = response.xpath('//div[@class="page-nav td-pb-padding-side"]/a/i[@class="td-icon-menu-right"]/../@href').extract_first()
 		print("nextPage", next_page)
 		if next_page is not None:
 			yield scrapy.Request(url=next_page, callback=self.parse)
@@ -42,18 +41,18 @@ class NoticiasSpider(scrapy.Spider):
 #		print(response.url)
 		item = HeraldoleonItem()
 		item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
-		item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
+#		item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
+		item['title'] = response.xpath('//h1[@class="tdb-title-text"]/text()').extract_first()
 		text=""
-		for p in response.xpath('//div[@class="td-post-content"]/p/text()').extract():
+		for p in response.xpath('//div[contains(@class, "td-post-content")]/div/p').extract():
-			nt = remove_tags(p).replace("\n","").replace("\r","").strip()
+			tt  = remove_tags(p)+ "\n"
-			text+=nt
+			text+=tt
-			if len(nt)>0:
-				text+="\n"
 		item['text'] = text.strip()
-		item['topic'] = ", ".join(response.xpath('//ul[@class="td-tags td-post-small-box clearfix"]/li/a/text()').extract())
+		item['topic'] = response.xpath('//ul[@class="tdb-tags"]/li/a/text()').extract()
 		item['url'] = response.url
-		item["author"]=", ".join(response.xpath('//div[@class="td-post-source-via "]/div/a/text()').extract())
+		item["author"]=", ".join(response.xpath('//div[contains(@class, "tdb_single_via ")]/div/a/text()').extract())
-		item["location"]=""
+#		item["location"]=""
-		print(self.allowed_domains, item["title"])
+		print(item["title"])
 		yield item