Commit 57f362cd authored by Mario Chirinos Colunga's avatar Mario Chirinos Colunga 💬

el sur

parent eac90030
......@@ -8,7 +8,7 @@
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Scrapy project settings fragment for the elSur crawler.
FEED_EXPORT_ENCODING = 'utf-8'  # write exported feeds (e.g. the -s filename JSON) as UTF-8
BOT_NAME = 'elSur'  # project/bot identifier used by Scrapy
SPIDER_MODULES = ['elSur.spiders']  # package(s) Scrapy scans for spider classes
......
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
#TAG_RE = re.compile(r'<[^>]+>')
#HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
def remove_tags(text):
    """Strip every HTML/XML tag from *text* and return the remaining plain text.

    FIX: the module-level ``TAG_RE`` constant is commented out in this
    revision (see the commented line above), so the original body raised
    NameError at call time.  The same pattern is inlined here instead.
    """
    return re.sub(r'<[^>]+>', '', text)
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class.

    Crawls the El Sur (Guerrero) date-archive page for a given date and
    yields one NoticiasItem per article link found on it.
    """
    name = "noticias"

    def start_requests(self):
        """Build the archive URL from the -a year/month/day arguments and request it.

        month and day are zero-padded with zfill; calling zfill on a
        missing argument (None) raises AttributeError, so all three
        arguments are effectively required.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link on the archive page.

        FIX: the original line was a SyntaxError -- the closing
        parenthesis of ``set(...)`` was missing.  ``set`` is kept so
        duplicate links are requested only once.
        Pagination handling is disabled in this revision, so only the
        first archive page is crawled.
        """
        print(response)
        for link in set(response.css('#post-list').css("li > div > div > a::attr(href)").extract()):
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Yield a NoticiasItem for an article page.

        All field-extraction logic is disabled (commented out) in this
        revision, so the yielded item carries no data yet.
        """
        print(response)
        item = NoticiasItem()
        yield item
# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
# Pattern matching any single HTML/XML tag.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag removed."""
    cleaned = re.sub(TAG_RE, '', text)
    return cleaned


# Byline headers ("Texto: ... Foto: ..." / "Texto y foto: ...") optionally
# followed by a Spanish dateline such as "5 de septiembre de 2018."
HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#from dateutil.parser import parse
from dateparser import parse
from datetime import datetime
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class.

    Crawls the El Sur (Guerrero) date archive at
    https://suracapulco.mx/<year>/<month>/<day>/ and yields one
    NoticiasItem per article that has a non-empty body text.
    """
    name = "noticias"

    def start_requests(self):
        """Build the archive URL from the -a year/month/day arguments and request it.

        month and day are zero-padded with zfill; calling zfill on a
        missing argument (None) raises AttributeError, so all three
        arguments are effectively required.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link on the archive page, then the first
        pagination link, when one exists."""
        print(response)
        for link in response.css('#post-list').css("li > div > div > a::attr(href)").extract():
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)
        # Follow pagination: the first link inside .paging-navigation, if any.
        next_pages = response.css(".paging-navigation > a::attr(href)").extract()
        if next_pages:
            next_page = next_pages[0]
            print(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        """Extract date, title, topic and body text from an article page
        and yield a populated NoticiasItem.

        Items with an empty body text are dropped.  FIX: the original
        else branch ended with a bare ``yield``, pushing a useless None
        into the Scrapy output pipeline; it has been removed.
        """
        print(response.encoding)
        date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        title = response.xpath('//meta[@property="og:title"]/@content').extract_first().lower()
        topic = response.xpath('//meta[@property="article:section"]/@content').extract_first().lower()
        # Body paragraphs, one per line with internal newlines stripped.
        text = ""
        for p in response.css("div.xt-post-content > p::text").extract():
            text += p.replace("\n", "") + "\n"
        item = NoticiasItem()
        # NOTE(review): int(date) assumes the published_time meta holds a
        # Unix timestamp; if the site emits an ISO-8601 string this raises
        # ValueError -- TODO confirm against a live page.
        item['date'] = datetime.fromtimestamp(int(date)).isoformat()
        item['title'] = title
        item['topic'] = topic
        item['text'] = text
        item['url'] = response.url
        if item["text"] != "":
            print(item)
            yield item
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.