Commit 11c4aa01 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent f1dfa7e9
......@@ -24,27 +24,213 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Build the day's base URL and schedule one request per section.

    ``year``, ``month`` and ``day`` are spider arguments
    (``scrapy crawl noticias -a year=2005 -a month=1 -a day=31``).
    The site's HTML template changed several times over the years, so the
    requested date selects which callback chain parses the pages.  The
    boundary dates are stored on the spider because the callbacks re-check
    them to pick sub-layouts.
    """
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
    self.baseURL = 'http://www.jornada.unam.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)+'/'
    # Dates on which the page structure changed (era boundaries).
    self.comparison_date_1 = date(2001, 12, 7)
    self.comparison_date_2 = date(2002, 1, 8)
    self.comparison_date_3 = date(2003, 4, 25)
    self.comparison_date_4 = date(2004, 11, 16)
    self.comparison_date_5 = date(2004, 12, 12)
    self.comparison_date_6 = date(2005, 1, 31)
    self.comparison_date_7 = date(2009, 2, 15)
    self.date = date(int(year), int(month), int(day))
    if self.date <= self.comparison_date_2:
        # Oldest layout: plain .html section pages.
        section_list = ['index.html', 'edito.html', 'opinion.html', 'correo.html', 'politica.html',
                        'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
                        'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html']
        parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
                   'correo.html': 'Correo', 'politica.html': 'Politica', 'economia.html': 'Economia',
                   'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
                   'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
                   'deportes.html': 'Deportes'}
        for s in section_list:
            item = NoticiasItem()
            item['date'] = self.date
            item['topic'] = parse_s[s]
            # 'edito' and 'correo' pages are single articles, not link indexes.
            if s == 'edito.html' or s == 'correo.html':
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
            else:
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse)
            request.meta['item'] = item
            yield request
    elif self.date <= self.comparison_date_3:
        # Transitional layout: sections may be served as .html or .php.
        section_list = ['index.html', 'edito.html', 'opinion.html', 'correo.html', 'politica.html',
                        'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
                        'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html',
                        'index.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
                        'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
                        'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php']
        parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
                   'correo.html': 'Correo', 'politica.html': 'Politica', 'economia.html': 'Economia',
                   'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
                   'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
                   'deportes.html': 'Deportes',
                   'index.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
                   'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
                   'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
                   'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
                   'deportes.php': 'Deportes'}
        for s in section_list:
            item = NoticiasItem()
            item['date'] = self.date
            item['topic'] = parse_s[s]
            if s in ('edito.html', 'correo.html', 'edito.php', 'correo.php'):
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
            else:
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_2)
            request.meta['item'] = item
            yield request
    elif self.date <= self.comparison_date_6:
        # PHP-only layout (2003-04-25 .. 2005-01-31).
        section_list = ['indexfla.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
                        'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
                        'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php', 'index.php']
        parse_s = {'indexfla.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
                   'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
                   'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
                   'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
                   'deportes.php': 'Deportes', 'index.php': 'Portada'}
        for s in section_list:
            item = NoticiasItem()
            item['date'] = self.date
            item['topic'] = parse_s[s]
            if s == 'edito.php' or s == 'correo.php':
                # Single-article pages; the item template changed on 2004-12-12.
                if self.date <= self.comparison_date_5:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
                else:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item_2)
            else:
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_3)
            request.meta['item'] = item
            yield request
    else:
        # Modern layouts: directory-style section URLs.  Pages up to and
        # including 2009-02-15 use one template, later dates another, so the
        # callback is chosen once (loop-invariant) before dispatching.
        section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
                        'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
        if self.date <= self.comparison_date_7:
            callback = self.parse_5
        else:
            callback = self.parse_6
        for s in section_list:
            yield scrapy.Request(url=self.baseURL+s, callback=callback)
def parse(self, response):
    """Index-page parser for the pre-2002-01-08 layouts.

    Chooses the container that holds the section's article links (the front
    page and the inner sections use different tables) and follows every
    '.html' link, forwarding the partially filled item via request meta.
    """
    item = response.meta['item']
    is_front_page = (item['topic'] == 'Portada')
    if self.date <= self.comparison_date_1:
        if is_front_page:
            selector = '//td[@rowspan="3"]'
        elif response.xpath('//td[@align="center"]').css('a::attr(href)').extract():
            selector = '//td[@align="center"]'
        else:
            # Some pages capitalise the attribute value.
            selector = '//td[@align="CENTER"]'
    elif self.date <= self.comparison_date_2:
        selector = '//empieza' if is_front_page else '//table[@bordercolor="#CCCCCC"]'
    for link in response.xpath(selector).css('a::attr(href)').extract():
        if link.endswith('.html'):
            request = scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
            request.meta['item'] = item
            yield request
def parse_2(self, response):
    """Section parser for the 2002-01-08 .. 2003-04-25 layout.

    Follows every '.html' link inside the bordered content table, passing
    the pre-filled item along in the request meta.
    """
    item = response.meta['item']
    hrefs = response.xpath('//table[@bordercolor="#CCCCCC"]').css('a::attr(href)').extract()
    for href in hrefs:
        if not href.endswith('.html'):
            continue
        request = scrapy.Request(url=self.baseURL + href, callback=self.parse_item)
        request.meta['item'] = item
        yield request
def parse_3(self, response):
    """Section parser for the 2003-04-25 .. 2005-01-31 layouts.

    Collects candidate article links from several possible containers and
    dispatches each to the item parser matching the date's sub-layout.
    """
    item = response.meta['item']
    container_paths = [
        '//td[@width="100%"]',
        '//td[@width="52%"]',
        '//td[@width="24%"]',
        '//td[@width="646"]',
        '//table[@width="100%"]',
    ]
    links = []
    for container in container_paths:
        links.extend(response.xpath(container).css('a::attr(href)').extract())
    for link in links:
        looks_like_article = (link.endswith('.html&fly=1') or
                              link.endswith('.php&fly=') or
                              link.endswith('.php'))
        if not looks_like_article:
            continue
        if self.comparison_date_3 < self.date <= self.comparison_date_6:
            if self.date <= self.comparison_date_4:
                # Relative links on the older sub-layout.
                request = scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
                request.meta['item'] = item
                yield request
            elif link.startswith('http') and link.endswith('.php'):
                # Later pages list absolute URLs; strip stray newlines first.
                target = link.replace('\n', '')
                if self.date <= self.comparison_date_5:
                    request = scrapy.Request(url=target, callback=self.parse_item)
                else:
                    request = scrapy.Request(url=target, callback=self.parse_item_2)
                request.meta['item'] = item
                yield request
def parse_4(self, response):
print response.url
for r in response.xpath('//td[@width="646"]').css('a::attr(href)').extract():
if r[-4:] == '.php':
print r.replace('\n','')
# request = scrapy.Request(url=r.replace('\n',''), callback=self.parse_item)
# request.meta['item'] = item
# yield request
def parse_5(self, response):
if ( response.url[:response.url.rfind('/')+1] == self.baseURL ): # verifica que se conserva la misma URL base
section = response.url[response.url.rfind('/')+1:]
if ( section == 'opinion' ): # la seccion 'opinion' tiene una estructura diferente a las otras
......@@ -56,49 +242,183 @@ class QuotesSpider(scrapy.Spider):
for path in path_list:
for link in response.xpath(path).extract():
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_3)
def parse_6(self, response):
    """Section parser for the post-2009-02-15 layout.

    Follows every article link in the item list, provided the response stayed
    on the same base URL (guards against redirects off the requested day).
    This reconstructs the post-commit version of the method: the span
    previously contained diff residue (a stale duplicate ``def parse_2`` line
    and a duplicate yield targeting the old ``parse_item_2`` callback).
    """
    # Verify the response still hangs off the day's base URL.
    if response.url[:response.url.rfind('/')+1] == self.baseURL:
        path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
                     '//*[@class="item"]/div/a/@href']
        for path in path_list:
            for link in response.xpath(path).extract():
                yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
def parse_item(self, response):
    """Article parser for the oldest page layouts (up to 2004-12-12).

    Title extraction: the old pages wrap the headline in a <font> tag whose
    size varies by era, so a list of candidate (xpath, result-index) pairs is
    tried in order — this replaces the original eleven-level pyramid of
    nested bare ``except:`` blocks that duplicated the same two lines.
    Behavior is preserved: the first candidate that yields a usable node
    wins; a missing node (None/IndexError from ``extract()[idx]``) or a
    ``remove_tags`` failure falls through to the next candidate.

    Body extraction: paragraphs are gathered from the container that matches
    the article's era; for dates after ``comparison_date_5`` that reach this
    flag-True path the text is left empty (as in the original).
    """
    item = response.meta['item']
    flag = True   # True -> extract text paragraph-by-paragraph; False -> slice title off the raw body
    text = ''
    title = None
    # Candidates in original priority order; index 1 entries take the second
    # match on pages where the first <font> is boilerplate.
    title_candidates = [
        ('//font[@size="5"]', 0),
        ('//p/font[@size="5"]', 0),
        ('//p/font[@size="5"]', 1),
        ('//font[@size="4"]', 0),
        ('//p/font[@size="4"]', 0),
        ('//p/font[@size="4"][1]', 1),
        ('//font[@size="3"]', 0),
        ('//p/font[@size="3"]', 0),
        ('//p/font[@size="3"][1]', 1),
        ('//font[@size="+1"]', 0),
        ('//font[@size="+0"]', 0),
    ]
    matched = False
    for path, idx in title_candidates:
        try:
            title = response.xpath(path).extract()[idx]
            item['title'] = remove_tags(title)
            matched = True
            break
        except Exception:   # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
            continue
    if not matched:
        # On 1999-10-03 and earlier the page HTML differs: the headline sits
        # in a <center> tag and the body must be sliced from the raw page.
        if self.date <= date(1999, 10, 3):
            try:
                title = remove_tags(response.xpath('//center').extract_first())
                item['title'] = title
                flag = False
            except Exception:
                pass
    if flag:
        if self.date <= self.comparison_date_1:
            # Takes every <p>; the first ones repeat the title (as in original).
            for p in response.css('p').extract():
                text += remove_tags(p).replace('\r', '')
            text = text.replace('\t', '')
        elif self.date <= self.comparison_date_3:
            for p in response.xpath('//table[@bordercolor="#CCCCCC"]').css('p').extract():
                text += remove_tags(p).replace('\r', '')
            text = text.replace('\t', '')
        elif self.date <= self.comparison_date_4:
            for p in response.css('p').extract():
                text += remove_tags(p).replace('\r', '')
            text = text.replace('\t', '')
        elif self.date <= self.comparison_date_5:
            p = response.css('p').extract()
            # Skip the first three paragraphs (header boilerplate); if that
            # leaves nothing, fall back to every paragraph.
            for i in range(3, len(p)):
                text += remove_tags(p[i]).replace('\r', '')
            text = text.replace('\t', '')
            if text == '':
                for i in range(0, len(p)):
                    text += remove_tags(p[i]).replace('\r', '')
                text = text.replace('\t', '')
    else:
        # Pre-1999-10-03 fallback: strip tags from the whole body and drop
        # the leading headline text.
        text = remove_tags(response.body)
        text = text[len(title):]
    item['text'] = text
    item['url'] = response.url
    yield item
def parse_item_2(self, response):
    """Article parser for the 2004-12-12 .. 2005-01-31 layout.

    Bug fix: the original title filter read ``if t is not None or t != ''``,
    a tautology that is true for every candidate, so an empty string could
    overwrite a previously found good title.  It now uses ``and`` so empty
    candidates are skipped; the last non-empty candidate wins, as intended.
    """
    item = response.meta['item']
    text = ''
    title_list = []
    title_list.extend(response.xpath('//*[@id="contenido"]/h1/text()').extract())
    title_list.extend(response.xpath('//h1/text()').extract())
    for t in title_list:
        # Keep only non-empty candidates (was 'or': always true).
        if t is not None and t != '':
            title = remove_tags(t).replace('\r', '')
            title = title.replace('\t', '')
            item['title'] = title
    # Body: skip the first four <p> elements (boilerplate); if that leaves
    # nothing, fall back to every paragraph.
    p = response.css('p').extract()
    for i in range(4, len(p)):
        text += remove_tags(p[i]).replace('\r', '')
    text = text.replace('\t', '')
    if text == '':
        for i in range(0, len(p)):
            text += remove_tags(p[i]).replace('\r', '')
        text = text.replace('\t', '')
    item['text'] = text
    item['url'] = response.url
    yield item
def parse_item_3(self, response):
    """Article parser for the 2005-01-31 .. 2009-02-15 layout.

    This reconstructs the post-commit version of the method: the span
    previously contained diff residue (dead duplicate assignments to
    ``item['date']``/``item['topic']`` and an old text loop using raw
    ``text()`` nodes alongside the new ``remove_tags`` loop).
    """
    item = NoticiasItem()
    text = ''
    item['date'] = self.date
    title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        # Some articles omit the 'title' class on the headline.
        item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
    item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract_first()
    for p in response.xpath('//*[@class="documentContent"]/p').extract():
        text += remove_tags(p).replace('\r', '')
    text = text.replace('\t', '')
    item['text'] = text
    item['url'] = response.url
    yield item
def parse_item_2(self, response):
def parse_item_4(self, response):
item = NoticiasItem()
text = ''
path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = self.date
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract()
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
for path in path_list:
for paragraph in response.xpath(path).extract():
text += remove_tags(paragraph)
for p in response.xpath(path).extract():
text += remove_tags(p).replace('\r','')
text = text.replace('\t','')
item['text'] = text
item['url'] = response.url
# print item['title']
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment