crawlers update

4149fd45 · Renán Sosa Guillen · c69e5696 · 4149fd45 · 4149fd45 · 4149fd45
Commit 4149fd45 authored Aug 11, 2017 by Renán Sosa Guillen
20 changed files
--- a/otros_sitios/laJornadaBC/laJornadaBC/spiders/noticias.py
+++ b/otros_sitios/laJornadaBC/laJornadaBC/spiders/noticias.py
@@ -44,7 +44,7 @@ class QuotesSpider(scrapy.Spider):
 		
 		for section in section_list:
 			self.section = section
-			print self.section
+			# print self.section
 			self.page = 0
 			self.count = 0
 			self.found = False
@@ -61,7 +61,7 @@ class QuotesSpider(scrapy.Spider):
 					self.next_section = True
 					break
 				page += 1
-				print 'page '+str(page)
+				# print 'page '+str(page)
 				yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(page), callback=self.parse, dont_filter=True)
 			
 			# if not ( self.next_section ):
@@ -80,7 +80,7 @@ class QuotesSpider(scrapy.Spider):

 	
 	def parse(self, response):
-		print response.url
+		# print response.url
 		count = 0
 		this_page = int(response.url[response.url.rfind('=')+1:])

@@ -94,10 +94,10 @@ class QuotesSpider(scrapy.Spider):
 		if ( len(link_list) > 0 ):
 			if ( this_page > 0 ):
 				del link_list[0]
-			print link_list
+			# print link_list
 			for link in link_list:
 				link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
-				print self.section+' '+link_date.isoformat()+' ? '+self.date.isoformat()
+				# print self.section+' '+link_date.isoformat()+' ? '+self.date.isoformat()
 				if ( link_date.month == self.date.month and link_date.year == self.date.year ):
 					# self.page = int(response.url[response.url.rfind('=')+1:])
 					# self.count += 1

--- a/otros_sitios/laJornadaBC/laJornadaBC/spiders/noticias.pyc
+++ b/otros_sitios/laJornadaBC/laJornadaBC/spiders/noticias.pyc
--- a/otros_sitios/laJornadaMaya/laJornadaMaya/spiders/noticias.py
+++ b/otros_sitios/laJornadaMaya/laJornadaMaya/spiders/noticias.py
+import scrapy
+import json
+from datetime import datetime, date, timedelta
+
+#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+
+import re
+
+
+# Matches one markup tag: '<', one or more characters other than '>', then '>'.
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every substring matched by TAG_RE removed."""
+    return TAG_RE.sub('', text)
+
+
+class NoticiasItem(scrapy.Item):
+    """Item describing one scraped news article."""
+    # The spider in this file fills title, text, date, topic and url;
+    # location and author are declared but not assigned by it.
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
+
+
+class QuotesSpider(scrapy.Spider):
+	"""Collect the articles that www.lajornadamaya.mx published on one day.
+
+	The day comes from the ``year``, ``month`` and ``day`` spider arguments,
+	e.g. ``scrapy crawl noticias -a year=2017 -a month=3 -a day=22``.
+	"""
+	name = "noticias"
+	def start_requests(self):
+		"""Yield the front-page and JSON-listing requests of every section.
+
+		Raises TypeError or ValueError (from ``int()``/``date()``) when one of
+		the date arguments is missing or is not a valid date.
+		"""
+		self.year = getattr(self, 'year', None)
+		self.month = getattr(self, 'month', None)
+		self.day = getattr(self, 'day', None)
+		self.req_date = date(int(self.year), int(self.month), int(self.day))
+		self.date_format = "%Y-%m-%d"
+		self.baseURL = 'https://www.lajornadamaya.mx'
+		section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional',
+						'internacional', 'opinion']
+
+		for section in section_list:
+			self.section = section
+
+			# Section front page (HTML): the first news, handled by parse_2.
+			yield scrapy.Request(url=self.baseURL+'/'+section, callback=self.parse_2)
+
+			# JSON listing of the section.
+			self.page = 0
+			self.flag = False
+			self.found = False
+
+			if ( section == 'opinion' ):
+				# 'opinion' has a single listing URL and no paging.
+				yield scrapy.Request(url=self.baseURL+'/notas?opinion', callback=self.parse_page)
+				continue
+
+			# Keep asking for listing pages until parse() raises self.flag.
+			# NOTE(review): this relies on parse() callbacks running while this
+			# generator is still being consumed by Scrapy -- confirm.
+			page = -1
+			while True:
+				if ( self.flag ):
+					self.flag = False
+					break
+				page += 1
+				yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(page), callback=self.parse)
+
+			if ( self.found ):
+				self.found = False
+				# Step back up to two pages from the page recorded by parse(),
+				# but never below page 0 (a match on page 0 used to start
+				# the range at ?p=-1).
+				self.page = max(self.page - 2, 0)
+
+				for pag in range(self.page, self.page+6):
+					yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(pag), callback=self.parse_page, dont_filter=True)
+
+	def parse_2(self, response): # for the first news (section front page)
+		"""Follow the front-page headline links whose date part is the requested day."""
+		path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
+		link_list = []
+		for path in path_list:
+			link_list += response.xpath(path).extract()
+
+		wanted = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+		for link in link_list:
+			# Everything before the last '/' of the link is compared as the date.
+			link_date = link[:link.rfind('/')]
+			if ( link_date == wanted ):
+				item = NoticiasItem()
+				item['date'] = link_date
+				item['topic'] = response.url[response.url.rfind('/')+1:].title()
+				request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
+				request.meta['item'] = item
+				yield request
+
+
+	def parse(self, response): # for the JSON listings
+		"""Probe one JSON listing page and update the paging flags.
+
+		Sets ``self.flag`` when paging must stop and, when an entry of the
+		requested day is seen, also ``self.found`` and ``self.page``.
+		"""
+		json_response = json.loads(response.text)
+
+		if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
+			json_list = json_response
+		else:
+			json_list = json_response['articles']
+
+		if not json_list:
+			# Nothing listed on this page: stop paging, otherwise the loop in
+			# start_requests would never see a reason to end.
+			self.flag = True
+			return
+
+		for line in json_list:
+			# publishDate is cut at its last space and parsed with date_format.
+			this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
+			this_date = this_date.date()
+
+			if ( this_date == self.req_date ):
+				self.page = int(response.url[response.url.rfind('=')+1:])
+				self.found = True
+				self.flag = True
+				break
+			elif ( this_date < self.req_date ):
+				# Already past the requested day: stop paging.
+				self.flag = True
+				break
+
+	def parse_item_2(self, response): # for the first news (section front page)
+		"""Complete the item passed in ``response.meta`` with title, text and url."""
+		item = response.meta['item']
+		text = ''
+		item['title'] = response.xpath('//article/h1/text()').extract_first()
+		for paragraph in response.xpath('//*[@class="txt"]').extract():
+			text += remove_tags(paragraph) + '\n'
+		item['text'] = text
+		item['url'] = response.url
+		# Logged instead of printed: printing a unicode title can raise
+		# UnicodeEncodeError under Python 2 when stdout is piped.
+		self.logger.debug('%s', item['title'])
+		yield item
+
+
+	def parse_page(self, response): # for the JSON listings
+		"""Request the article page of every listed entry dated on the requested day."""
+		json_response = json.loads(response.text)
+		is_opinion = ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' )
+
+		if not is_opinion:
+			# The topic is the URL text between the last '/' and the '?p=' suffix.
+			topic = response.url[response.url.rfind('/')+1:response.url.rfind('=')-2].title()
+			json_list = json_response
+		else:
+			json_list = json_response['articles']
+			topic = 'Opinion'
+
+		for line in json_list:
+			this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
+			this_date = this_date.date()
+
+			if ( this_date == self.req_date ):
+				item = NoticiasItem()
+				item['date'] = line['publishDate']
+				item['topic'] = topic
+				item['title'] = line['name']
+				if not is_opinion:
+					request = scrapy.Request(url=self.baseURL+line['url'], callback=self.parse_item)
+				else:
+					# Opinion entries: the URL is built from the date part and uriComponent.
+					request = scrapy.Request(url=self.baseURL+'/'+line['publishDate'][:line['publishDate'].rfind(' ')]+'/'+line['uriComponent'], callback=self.parse_item)
+				request.meta['item'] = item
+				yield request
+
+
+	def parse_item(self, response): # for the JSON listings
+		"""Complete the item passed in ``response.meta`` with text and url."""
+		item = response.meta['item']
+		text = ''
+		for paragraph in response.xpath('//*[@class="txt"]').extract():
+			text += remove_tags(paragraph) + '\n'
+		item['text'] = text
+		item['url'] = response.url
+		# Logged instead of printed: printing a unicode title can raise
+		# UnicodeEncodeError under Python 2 when stdout is piped.
+		self.logger.debug('%s', item['title'])
+		yield item
+
+
--- a/sitios_yucatan/diarioYucatan/diarioYucatan/settings.py
+++ b/sitios_yucatan/diarioYucatan/diarioYucatan/settings.py
@@ -24,7 +24,7 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY=2
+# DOWNLOAD_DELAY=3
+# NOTE(review): with the setting commented out no DOWNLOAD_DELAY is defined, so the default of 0 noted above applies -- confirm this is intended.
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN=16
 #CONCURRENT_REQUESTS_PER_IP=16

--- a/sitios_yucatan/diarioYucatan/diarioYucatan/settings.pyc
+++ b/sitios_yucatan/diarioYucatan/diarioYucatan/settings.pyc
--- a/sitios_yucatan/diarioYucatan/diarioYucatan/spiders/noticias.py
+++ b/sitios_yucatan/diarioYucatan/diarioYucatan/spiders/noticias.py
 import scrapy


-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24

 import re

 from datetime import datetime, date, timedelta
+from scrapy.spidermiddlewares.httperror import HttpError

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -91,6 +92,7 @@ class QuotesSpider(scrapy.Spider):
 			yield scrapy.Request(url=self.baseURL+s, callback=self.parse)


+
 	def parse(self, response):
 		if ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) > 0 ):
 			for link in response.xpath('//a[@class="show-more-link"]/@href').extract():
@@ -100,6 +102,13 @@ class QuotesSpider(scrapy.Spider):
 			yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)


+	def errback_http(self, failure):	
+		"""Errback: log the URL of an HTTP error response and set ``self.stop``.
+
+		Failures that are not ``HttpError`` are ignored.
+		"""
+		if failure.check(HttpError):
+			# For HttpError the failing response is carried by the exception value.
+			response = failure.value.response
+			self.logger.error('HttpError on %s', response.url)
+			self.stop = True
+
+
 	def parse_pagination(self, response):
 		pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
 		if ( len(pagination) > 0 ):
@@ -127,7 +136,7 @@ class QuotesSpider(scrapy.Spider):
 			entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
 			entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]

-			news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:1]))
+			news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:entry_date.find(' ')]))
 			link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
 			
 			if news_date == self.date and link is not None:

--- a/sitios_yucatan/diarioYucatan/diarioYucatan/spiders/noticias.pyc
+++ b/sitios_yucatan/diarioYucatan/diarioYucatan/spiders/noticias.pyc
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/__init__.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/__init__.py
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/__init__.pyc
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/__init__.pyc
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/items.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/items.py
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class Diarioyucatan2Item(scrapy.Item):
+    """Placeholder item of the diarioYucatan2 project; it declares no fields."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/middlewares.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/middlewares.py
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class Diarioyucatan2SpiderMiddleware(object):
+    """Pass-through spider middleware for the diarioYucatan2 project.
+
+    Not all methods need to be defined. If a method is not defined,
+    scrapy acts as if the spider middleware does not modify the
+    passed objects.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and connect it to the spider_opened signal."""
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    # The process_* hooks are invoked on the middleware instance, so each one
+    # must take ``self`` first; without it the call raises TypeError.
+
+    def process_spider_input(self, response, spider):
+        """Accept every response unchanged."""
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        """Yield every element of ``result`` unchanged."""
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        """Do nothing and return None."""
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        """Yield every start request unchanged."""
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/pipelines.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/pipelines.py
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class Diarioyucatan2Pipeline(object):
+    """Item pipeline that returns every item unchanged."""
+    def process_item(self, item, spider):
+        # No processing is applied; the item is handed on as received.
+        return item
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/settings.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/settings.py
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for diarioYucatan2 project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'diarioYucatan2'
+
+SPIDER_MODULES = ['diarioYucatan2.spiders']
+NEWSPIDER_MODULE = 'diarioYucatan2.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'diarioYucatan2 (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 2
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'diarioYucatan2.middlewares.Diarioyucatan2SpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'diarioYucatan2.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'diarioYucatan2.pipelines.Diarioyucatan2Pipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/settings.pyc
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/settings.pyc
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/__init__.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/__init__.pyc
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/__init__.pyc
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/noticias.py
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/noticias.py
+import scrapy
+
+
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
+
+import re
+
+from datetime import datetime, date, timedelta
+from scrapy.spidermiddlewares.httperror import HttpError
+
+# Matches one markup tag: '<', one or more characters other than '>', then '>'.
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+	"""Return ``text`` with every substring matched by TAG_RE removed."""
+	return TAG_RE.sub('', text)
+
+
+class NoticiasItem(scrapy.Item):
+	"""Item describing one scraped news article."""
+	# The spider in this file fills title, text, date, topic and url;
+	# location and author are declared but not assigned by it.
+	title = scrapy.Field()
+	text = scrapy.Field()
+	date = scrapy.Field()
+	location = scrapy.Field()
+	author = scrapy.Field()
+	topic = scrapy.Field()
+	url = scrapy.Field()
+	
+
+class QuotesSpider(scrapy.Spider):
+	"""Collect yucatan.com.mx articles dated on or after a given day.
+
+	The day comes from the ``year``, ``month`` and ``day`` spider arguments,
+	e.g. ``scrapy crawl noticias -a year=2016 -a month=12 -a day=24``.
+	"""
+	name = "noticias"
+
+	def start_requests(self):
+		"""Yield one request per section listing.
+
+		Raises TypeError or ValueError (from ``int()``/``date()``) when one of
+		the date arguments is missing or is not a valid date.
+		"""
+		section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes',
+										'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud']
+
+		year = getattr(self, 'year', None)
+		month = getattr(self, 'month', None)
+		day = getattr(self, 'day', None)
+		self.baseURL='http://yucatan.com.mx/seccion/'
+		self.date = date(int(year), int(month), int(day))
+		# Spanish month names as they appear in the listing dates.
+		self.parsing_month = { 'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
+													 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12 }
+		# Raised by parse_link once an entry older than self.date is seen.
+		# NOTE(review): the flag is shared by all sections -- confirm that one
+		# section reaching old entries should also stop the others.
+		self.stop = False
+		for s in section_list:
+			yield scrapy.Request(url=self.baseURL+s, callback=self.parse)
+
+
+
+	def parse(self, response):
+		"""Follow the "show more" links of a section page, or paginate the page itself."""
+		more_links = response.xpath('//a[@class="show-more-link"]/@href').extract()
+		if more_links:
+			for link in more_links:
+				yield scrapy.Request(url=link, callback=self.parse_pagination)
+		else:
+			# Same URL again, so the duplicate filter has to be bypassed.
+			yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)
+
+
+	def errback_http(self, failure):	
+		"""Errback: log the URL of an HTTP error response and set ``self.stop``."""
+		# NOTE(review): no request built in this class passes errback=, so
+		# this method is not invoked from here -- confirm it is still needed.
+		if failure.check(HttpError):
+			response = failure.value.response
+			self.logger.error('HttpError on %s', response.url)
+			self.stop = True
+
+
+	def parse_pagination(self, response):
+		"""Yield one parse_link request per listing page until ``self.stop`` is set."""
+		pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
+		if not pagination:
+			# No pagination links: only the page itself is listed.
+			yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
+			return
+
+		# The page count is the last path segment of the last pagination
+		# link; a trailing '/' is tolerated.
+		last_link = pagination[-1].rstrip('/')
+		pages = int(last_link[last_link.rfind('/')+1:])
+
+		for p in range(1, pages+1):
+			if ( self.stop ):
+				break
+			if ( p == 1 ):
+				yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
+			else:
+				# Page p lives at /page/p; the previous str(p+1) skipped
+				# page 2 and asked for one page past the last.
+				yield scrapy.Request(url=response.url+'/page/'+str(p), callback=self.parse_link)
+
+
+	def parse_link(self, response):
+		"""Request every listed article dated on or after ``self.date``."""
+		for entry in response.xpath('//*[@class="bp-entry"]'):
+			entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
+			# Keep the text after the last ', ' and cut it two characters
+			# before the last '-' that follows.
+			entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]
+
+			# Year: last four characters; month: the word ending eight
+			# characters before the end, mapped through parsing_month;
+			# day: everything before the first space.
+			news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:entry_date.find(' ')]))
+			link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
+			
+			if news_date >= self.date and link is not None:
+				yield scrapy.Request(url=link, callback=self.parse_item)
+
+			elif news_date < self.date:
+				# Older than requested: tell parse_pagination to stop.
+				self.stop = True
+
+
+
+	def parse_item(self, response):
+		"""Build a NoticiasItem from an article page."""
+		text = ''
+		item = NoticiasItem()
+		item['title'] = response.css('h1.entry-title::text').extract_first()
+		item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
+		for paragraph in response.css('div.entry-content').css('p').extract():
+			text += remove_tags(paragraph) + '\n'		
+		item['text'] = text
+		# The topic is the second breadcrumb text, stored as a one-element list.
+		item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
+		item['url'] = response.url
+		yield item
+
+
--- a/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/noticias.pyc
+++ b/sitios_yucatan/diarioYucatan2/diarioYucatan2/spiders/noticias.pyc
--- a/sitios_yucatan/diarioYucatan2/scrapy.cfg
+++ b/sitios_yucatan/diarioYucatan2/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = diarioYucatan2.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = diarioYucatan2
--- a/sitios_yucatan/laVerdadYuc/laVerdadYuc/spiders/noticias.py
+++ b/sitios_yucatan/laVerdadYuc/laVerdadYuc/spiders/noticias.py
+from scrapy.spidermiddlewares.httperror import HttpError
+import scrapy
+
+#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+
+import re
+
+# Matches one markup tag: '<', one or more characters other than '>', then '>'.
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+	"""Return ``text`` with every substring matched by TAG_RE removed."""
+	return TAG_RE.sub('', text)
+
+class NoticiasItem(scrapy.Item):
+	"""Item describing one scraped news article."""
+	# The active spider in this file fills title, text, date, topic and url;
+	# location and author are declared but not assigned by it.
+	title = scrapy.Field()
+	text = scrapy.Field()
+	date = scrapy.Field()
+	location = scrapy.Field()
+	author = scrapy.Field()
+	topic = scrapy.Field()
+	url = scrapy.Field()
+
+# class QuotesSpider(scrapy.Spider):
+# 	name = "noticias"
+# 	def start_requests(self):
+# 		year = getattr(self, 'year', None)
+# 		month = getattr(self, 'month', None)
+# 		day = getattr(self, 'day', None)
+# 		self.baseURL='http://laverdadnoticias.com/'+year+'/'+month+'/'+day
+# 		urls = [
+# 			self.baseURL,
+# 								]
+# 		for url in urls:
+# 			yield scrapy.Request(url=url, callback=self.parse)
+
+# 	def parse(self, response):
+# 		pagination = response.css("div.pagination_div").css("a::attr(href)").extract()
+# 		if ( len(pagination) > 0 ):
+# 			pagination = pagination[-2].strip('/')
+# 			pages = int(pagination[pagination.rfind('/')+1:])
+# 			for page in range(0,pages):
+# 				if ( page == 0 ):
+# 					yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+# 				else:
+# 					yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+# 		else:
+# 			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+# 	def parse_page(self, response):
+# 		item = NoticiasItem()
+# 		for post in response.css('div.col-md-12').css('div.cp-post-content'):
+# 			item['topic'] = post.css('ul.cp-post-tools').css('li').css('a::attr(title)').extract()
+# 			item['author'] = post.css('ul.cp-post-tools').xpath('./li[2]/text()').extract_first()
+# 			request = scrapy.Request(url=post.xpath('./h3/a/@href').extract_first(), callback=self.parse_item)
+# 			request.meta['item'] = item
+# 			yield request
+
+# 	def parse_item(self, response):
+# 		item = response.meta['item']
+# 		text = ''
+# 		item['title'] = response.css("div.cp-post-content").css("h1").css("a::text").extract_first()
+# 		item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+# 		for paragraph in response.css("div.cp-post-content").css("p").extract():
+# 			text += remove_tags(paragraph) + '\n'
+# 		item['text'] = text
+# 		item['url'] = response.url
+# 		# print item['title']
+# 		yield item
+
+class QuotesSpider(scrapy.Spider):
+	"""Collect the laverdadnoticias.com articles archived under one day.
+
+	The day comes from the ``year``, ``month`` and ``day`` spider arguments,
+	e.g. ``scrapy crawl noticias -a year=2017 -a month=3 -a day=22``.
+	"""
+	name = "noticias"
+	
+	def start_requests(self):
+		"""Yield the day archive and its /page/N pages until a request fails.
+
+		Raises TypeError when one of the date arguments is missing, because
+		the arguments are concatenated into the archive URL as strings.
+		"""
+		year = getattr(self, 'year', None)
+		month = getattr(self, 'month', None)
+		day = getattr(self, 'day', None)
+		self.baseURL='http://laverdadnoticias.com/'+year+'/'+month+'/'+day
+		self.stop = False
+		page = 0
+
+		# The number of pages is not known in advance: keep requesting the
+		# next one until errback_http sets self.stop.
+		while not self.stop:
+			if page == 0:
+				url = self.baseURL
+			else:
+				url = self.baseURL+'/page/'+str(page)
+			yield scrapy.Request(url=url, callback=self.parse, errback=self.errback_http)
+
+			page += 1
+
+
+	def parse(self, response):
+		"""Request every article linked from an archive page."""
+		for link in response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract():
+			yield scrapy.Request(url=link, callback=self.parse_item)
+
+
+	def errback_http(self, failure):	
+		"""Errback: log the failed request and stop the paging loop.
+
+		Every kind of failure sets ``self.stop``; if only HttpError did, a
+		DNS or timeout failure would leave start_requests looping.
+		"""
+		if failure.check(HttpError):
+			# For HttpError the failing response is carried by the exception value.
+			response = failure.value.response
+			self.logger.error('HttpError on %s', response.url)
+		else:
+			self.logger.error('Request failed on %s: %r', failure.request.url, failure.value)
+		self.stop = True
+
+
+	def parse_item(self, response):
+		"""Build a NoticiasItem from an article page."""
+		item = NoticiasItem()
+		text = ''
+		item['date'] = response.xpath('//meta[@property="DC.date.issued"]/@content').extract_first()
+		item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
+		item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()
+		for paragraph in response.xpath('//*[@class="post_content_wrapper"]/p').extract():
+			text += remove_tags(paragraph) + '\n'
+		item['text'] = text
+		item['url'] = response.url
+		yield item
\ No newline at end of file