el valle

9166c6e7 · Mario Chirinos · 42ffe60e · 9166c6e7
Commit 9166c6e7 authored Dec 17, 2024 by Mario Chirinos
Show whitespace changes
Inline Side-by-side

Showing with 61 additions and 0 deletions

noticias.py spiders/daily/elValle/elValle/spiders/noticias.py +61 -0

No files found.
--- a/spiders/daily/elValle/elValle/spiders/noticias.py
+++ b/spiders/daily/elValle/elValle/spiders/noticias.py
+"""
+	Spider for elvalle.com.mx
+	Author: Mario Chirinos Coluga
+	Usage: scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
+"""
+import scrapy
+import re
+from elValle.items import ElvalleItem
+#-------------------------------------------------------------------------------
+# Matches one markup tag: '<', one or more characters other than '>', then '>'.
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+	"""Return a copy of text with every substring matched by TAG_RE removed."""
+	stripped = TAG_RE.sub('', text)
+	return stripped
+#-------------------------------------------------------------------------------
+class NoticiasSpider(scrapy.Spider):
+	"""Spider that collects the articles listed on one day's archive page.
+
+	The day is given through the spider arguments ``year``, ``month`` and
+	``day`` (``-a year=2017 -a month=4 -a day=23``). The archive URL that is
+	crawled has the form ``https://elvalle.com.mx/YYYY/MM/DD/``.
+	"""
+	name = 'noticias'
+	allowed_domains = ['elvalle.com.mx']
+	start_urls = ['https://elvalle.com.mx/']
+	#-----------------------------------------------------------------------
+	def start_requests(self):
+		"""Yield the request for the archive page of the requested day.
+
+		Reads the ``year``, ``month`` and ``day`` spider arguments, stores
+		the zero-padded date in ``self.date`` (``YYYY-MM-DD``) and the
+		archive address in ``self.baseURL``.
+
+		Raises:
+			ValueError: if any of the three arguments was not supplied.
+		"""
+		parts = {}
+		for key in ("year", "month", "day"):
+			value = getattr(self, key, None)
+			if value is None:
+				# Fail with an explicit message instead of a TypeError from
+				# concatenating None with a string.
+				raise ValueError("missing spider argument: -a " + key + "=<value>")
+			# str() also accepts values passed as integers when the spider is
+			# instantiated from code rather than from the command line.
+			parts[key] = str(value)
+		year = parts["year"]
+		month = parts["month"].zfill(2)
+		day = parts["day"].zfill(2)
+		self.date = year + "-" + month + "-" + day
+		self.baseURL = "https://elvalle.com.mx/" + year + "/" + month + "/" + day + "/"
+		yield scrapy.Request(url=self.baseURL, callback=self.parseList)
+	#-----------------------------------------------------------------------
+	def parseList(self, response):
+		"""Yield one request per article link, then follow the pagination.
+
+		Article links are the ``href`` values found under
+		``//article//div[@class="post-details"]/a``; each is handled by
+		``parseItem``. The link in the last pagination entry is requested
+		with this same callback.
+		"""
+		print(response.url)
+		for href in response.xpath('//article//div[@class="post-details"]/a/@href').extract():
+			# urljoin leaves absolute URLs untouched and resolves relative
+			# ones, which scrapy.Request would otherwise reject.
+			yield scrapy.Request(url=response.urljoin(href), callback=self.parseItem)
+		# NOTE(review): the last <li> is presumably the "next page" link;
+		# confirm against the site's pagination markup.
+		next_page = response.xpath('//ul[@class="pagination clearfix"]/li[last()]/a/@href').extract_first()
+		print("nextPage", next_page)
+		# Truthiness also skips an empty href, not only a missing link.
+		if next_page:
+			yield scrapy.Request(url=response.urljoin(next_page), callback=self.parseList)
+	#-----------------------------------------------------------------------
+	def parseItem(self, response):
+		"""Build and yield an ElvalleItem from one article page.
+
+		Fields filled: ``title``, ``date``, ``topic`` (list of category
+		names), ``author``, ``text`` (the ``//article/p`` text nodes, each
+		followed by a newline) and ``url`` (the canonical link).
+		"""
+		item = ElvalleItem()
+		paragraphs = response.xpath('//article/p/text()').extract()
+		# Single join instead of repeated string concatenation.
+		text = ''.join(remove_tags(p) + "\n" for p in paragraphs)
+		item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
+		item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract()
+		item['author'] = response.xpath('//meta[@name="author"]/@content').extract_first()
+		item['text'] = text
+		item['url'] = response.xpath('//link[@rel="canonical"]/@href').extract_first()
+		print(item['title'])
+		yield item