Commit 11c4aa01 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent f1dfa7e9
......@@ -24,27 +24,213 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Build the day's base URL and schedule one request per section.

    ``year``, ``month`` and ``day`` are spider arguments
    (``scrapy crawl noticias -a year=2005 -a month=1 -a day=31``).
    The site's HTML template changed several times over the years, so the
    requested date selects which callback chain parses the pages.  The
    boundary dates are stored on the spider because the callbacks re-check
    them to pick sub-layouts.
    """
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
    self.baseURL = 'http://www.jornada.unam.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)+'/'
    # Dates on which the page structure changed (era boundaries).
    self.comparison_date_1 = date(2001, 12, 7)
    self.comparison_date_2 = date(2002, 1, 8)
    self.comparison_date_3 = date(2003, 4, 25)
    self.comparison_date_4 = date(2004, 11, 16)
    self.comparison_date_5 = date(2004, 12, 12)
    self.comparison_date_6 = date(2005, 1, 31)
    self.comparison_date_7 = date(2009, 2, 15)
    self.date = date(int(year), int(month), int(day))
    if self.date <= self.comparison_date_2:
        # Oldest layout: plain .html section pages.
        section_list = ['index.html', 'edito.html', 'opinion.html', 'correo.html', 'politica.html',
                        'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
                        'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html']
        parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
                   'correo.html': 'Correo', 'politica.html': 'Politica', 'economia.html': 'Economia',
                   'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
                   'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
                   'deportes.html': 'Deportes'}
        for s in section_list:
            item = NoticiasItem()
            item['date'] = self.date
            item['topic'] = parse_s[s]
            # 'edito' and 'correo' pages are single articles, not link indexes.
            if s == 'edito.html' or s == 'correo.html':
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
            else:
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse)
            request.meta['item'] = item
            yield request
    elif self.date <= self.comparison_date_3:
        # Transitional layout: sections may be served as .html or .php.
        section_list = ['index.html', 'edito.html', 'opinion.html', 'correo.html', 'politica.html',
                        'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
                        'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html',
                        'index.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
                        'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
                        'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php']
        parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
                   'correo.html': 'Correo', 'politica.html': 'Politica', 'economia.html': 'Economia',
                   'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
                   'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
                   'deportes.html': 'Deportes',
                   'index.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
                   'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
                   'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
                   'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
                   'deportes.php': 'Deportes'}
        for s in section_list:
            item = NoticiasItem()
            item['date'] = self.date
            item['topic'] = parse_s[s]
            if s in ('edito.html', 'correo.html', 'edito.php', 'correo.php'):
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
            else:
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_2)
            request.meta['item'] = item
            yield request
    elif self.date <= self.comparison_date_6:
        # PHP-only layout (2003-04-25 .. 2005-01-31).
        section_list = ['indexfla.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
                        'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
                        'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php', 'index.php']
        parse_s = {'indexfla.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
                   'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
                   'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
                   'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
                   'deportes.php': 'Deportes', 'index.php': 'Portada'}
        for s in section_list:
            item = NoticiasItem()
            item['date'] = self.date
            item['topic'] = parse_s[s]
            if s == 'edito.php' or s == 'correo.php':
                # Single-article pages; the item template changed on 2004-12-12.
                if self.date <= self.comparison_date_5:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
                else:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item_2)
            else:
                request = scrapy.Request(url=self.baseURL+s, callback=self.parse_3)
            request.meta['item'] = item
            yield request
    else:
        # Modern layouts: directory-style section URLs.  Pages up to and
        # including 2009-02-15 use one template, later dates another, so the
        # callback is chosen once (loop-invariant) before dispatching.
        section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
                        'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
        if self.date <= self.comparison_date_7:
            callback = self.parse_5
        else:
            callback = self.parse_6
        for s in section_list:
            yield scrapy.Request(url=self.baseURL+s, callback=callback)
def parse(self, response):
    """Index-page parser for the pre-2002-01-08 layouts.

    Chooses the container that holds the section's article links (the front
    page and the inner sections use different tables) and follows every
    '.html' link, forwarding the partially filled item via request meta.
    """
    item = response.meta['item']
    is_front_page = (item['topic'] == 'Portada')
    if self.date <= self.comparison_date_1:
        if is_front_page:
            selector = '//td[@rowspan="3"]'
        elif response.xpath('//td[@align="center"]').css('a::attr(href)').extract():
            selector = '//td[@align="center"]'
        else:
            # Some pages capitalise the attribute value.
            selector = '//td[@align="CENTER"]'
    elif self.date <= self.comparison_date_2:
        selector = '//empieza' if is_front_page else '//table[@bordercolor="#CCCCCC"]'
    for link in response.xpath(selector).css('a::attr(href)').extract():
        if link.endswith('.html'):
            request = scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
            request.meta['item'] = item
            yield request
def parse_2(self, response):
    """Section parser for the 2002-01-08 .. 2003-04-25 layout.

    Follows every '.html' link inside the bordered content table, passing
    the pre-filled item along in the request meta.
    """
    item = response.meta['item']
    hrefs = response.xpath('//table[@bordercolor="#CCCCCC"]').css('a::attr(href)').extract()
    for href in hrefs:
        if not href.endswith('.html'):
            continue
        request = scrapy.Request(url=self.baseURL + href, callback=self.parse_item)
        request.meta['item'] = item
        yield request
def parse_3(self, response):
    """Section parser for the 2003-04-25 .. 2005-01-31 layouts.

    Collects candidate article links from several possible containers and
    dispatches each to the item parser matching the date's sub-layout.
    """
    item = response.meta['item']
    container_paths = [
        '//td[@width="100%"]',
        '//td[@width="52%"]',
        '//td[@width="24%"]',
        '//td[@width="646"]',
        '//table[@width="100%"]',
    ]
    links = []
    for container in container_paths:
        links.extend(response.xpath(container).css('a::attr(href)').extract())
    for link in links:
        looks_like_article = (link.endswith('.html&fly=1') or
                              link.endswith('.php&fly=') or
                              link.endswith('.php'))
        if not looks_like_article:
            continue
        if self.comparison_date_3 < self.date <= self.comparison_date_6:
            if self.date <= self.comparison_date_4:
                # Relative links on the older sub-layout.
                request = scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
                request.meta['item'] = item
                yield request
            elif link.startswith('http') and link.endswith('.php'):
                # Later pages list absolute URLs; strip stray newlines first.
                target = link.replace('\n', '')
                if self.date <= self.comparison_date_5:
                    request = scrapy.Request(url=target, callback=self.parse_item)
                else:
                    request = scrapy.Request(url=target, callback=self.parse_item_2)
                request.meta['item'] = item
                yield request
def parse_4(self, response):
print response.url
for r in response.xpath('//td[@width="646"]').css('a::attr(href)').extract():
if r[-4:] == '.php':
print r.replace('\n','')
# request = scrapy.Request(url=r.replace('\n',''), callback=self.parse_item)
# request.meta['item'] = item
# yield request
def parse_5(self, response):
if ( response.url[:response.url.rfind('/')+1] == self.baseURL ): # verifica que se conserva la misma URL base
section = response.url[response.url.rfind('/')+1:]
if ( section == 'opinion' ): # la seccion 'opinion' tiene una estructura diferente a las otras
......@@ -56,49 +242,183 @@ class QuotesSpider(scrapy.Spider):
for path in path_list:
for link in response.xpath(path).extract():
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_3)
def parse_6(self, response):
    """Section parser for the post-2009-02-15 layout.

    Follows every article link in the item list, provided the response stayed
    on the same base URL (guards against redirects off the requested day).
    This reconstructs the post-commit version of the method: the span
    previously contained diff residue (a stale duplicate ``def parse_2`` line
    and a duplicate yield targeting the old ``parse_item_2`` callback).
    """
    # Verify the response still hangs off the day's base URL.
    if response.url[:response.url.rfind('/')+1] == self.baseURL:
        path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
                     '//*[@class="item"]/div/a/@href']
        for path in path_list:
            for link in response.xpath(path).extract():
                yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
def parse_item(self, response):
    """Article parser for the oldest page layouts (up to 2004-12-12).

    Title extraction: the old pages wrap the headline in a <font> tag whose
    size varies by era, so a list of candidate (xpath, result-index) pairs is
    tried in order — this replaces the original eleven-level pyramid of
    nested bare ``except:`` blocks that duplicated the same two lines.
    Behavior is preserved: the first candidate that yields a usable node
    wins; a missing node (None/IndexError from ``extract()[idx]``) or a
    ``remove_tags`` failure falls through to the next candidate.

    Body extraction: paragraphs are gathered from the container that matches
    the article's era; for dates after ``comparison_date_5`` that reach this
    flag-True path the text is left empty (as in the original).
    """
    item = response.meta['item']
    flag = True   # True -> extract text paragraph-by-paragraph; False -> slice title off the raw body
    text = ''
    title = None
    # Candidates in original priority order; index 1 entries take the second
    # match on pages where the first <font> is boilerplate.
    title_candidates = [
        ('//font[@size="5"]', 0),
        ('//p/font[@size="5"]', 0),
        ('//p/font[@size="5"]', 1),
        ('//font[@size="4"]', 0),
        ('//p/font[@size="4"]', 0),
        ('//p/font[@size="4"][1]', 1),
        ('//font[@size="3"]', 0),
        ('//p/font[@size="3"]', 0),
        ('//p/font[@size="3"][1]', 1),
        ('//font[@size="+1"]', 0),
        ('//font[@size="+0"]', 0),
    ]
    matched = False
    for path, idx in title_candidates:
        try:
            title = response.xpath(path).extract()[idx]
            item['title'] = remove_tags(title)
            matched = True
            break
        except Exception:   # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
            continue
    if not matched:
        # On 1999-10-03 and earlier the page HTML differs: the headline sits
        # in a <center> tag and the body must be sliced from the raw page.
        if self.date <= date(1999, 10, 3):
            try:
                title = remove_tags(response.xpath('//center').extract_first())
                item['title'] = title
                flag = False
            except Exception:
                pass
    if flag:
        if self.date <= self.comparison_date_1:
            # Takes every <p>; the first ones repeat the title (as in original).
            for p in response.css('p').extract():
                text += remove_tags(p).replace('\r', '')
            text = text.replace('\t', '')
        elif self.date <= self.comparison_date_3:
            for p in response.xpath('//table[@bordercolor="#CCCCCC"]').css('p').extract():
                text += remove_tags(p).replace('\r', '')
            text = text.replace('\t', '')
        elif self.date <= self.comparison_date_4:
            for p in response.css('p').extract():
                text += remove_tags(p).replace('\r', '')
            text = text.replace('\t', '')
        elif self.date <= self.comparison_date_5:
            p = response.css('p').extract()
            # Skip the first three paragraphs (header boilerplate); if that
            # leaves nothing, fall back to every paragraph.
            for i in range(3, len(p)):
                text += remove_tags(p[i]).replace('\r', '')
            text = text.replace('\t', '')
            if text == '':
                for i in range(0, len(p)):
                    text += remove_tags(p[i]).replace('\r', '')
                text = text.replace('\t', '')
    else:
        # Pre-1999-10-03 fallback: strip tags from the whole body and drop
        # the leading headline text.
        text = remove_tags(response.body)
        text = text[len(title):]
    item['text'] = text
    item['url'] = response.url
    yield item
def parse_item_2(self, response):
    """Article parser for the 2004-12-12 .. 2005-01-31 layout.

    Bug fix: the original title filter read ``if t is not None or t != ''``,
    a tautology that is true for every candidate, so an empty string could
    overwrite a previously found good title.  It now uses ``and`` so empty
    candidates are skipped; the last non-empty candidate wins, as intended.
    """
    item = response.meta['item']
    text = ''
    title_list = []
    title_list.extend(response.xpath('//*[@id="contenido"]/h1/text()').extract())
    title_list.extend(response.xpath('//h1/text()').extract())
    for t in title_list:
        # Keep only non-empty candidates (was 'or': always true).
        if t is not None and t != '':
            title = remove_tags(t).replace('\r', '')
            title = title.replace('\t', '')
            item['title'] = title
    # Body: skip the first four <p> elements (boilerplate); if that leaves
    # nothing, fall back to every paragraph.
    p = response.css('p').extract()
    for i in range(4, len(p)):
        text += remove_tags(p[i]).replace('\r', '')
    text = text.replace('\t', '')
    if text == '':
        for i in range(0, len(p)):
            text += remove_tags(p[i]).replace('\r', '')
        text = text.replace('\t', '')
    item['text'] = text
    item['url'] = response.url
    yield item
def parse_item_3(self, response):
    """Article parser for the 2005-01-31 .. 2009-02-15 layout.

    This reconstructs the post-commit version of the method: the span
    previously contained diff residue (dead duplicate assignments to
    ``item['date']``/``item['topic']`` and an old text loop using raw
    ``text()`` nodes alongside the new ``remove_tags`` loop).
    """
    item = NoticiasItem()
    text = ''
    item['date'] = self.date
    title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        # Some articles omit the 'title' class on the headline.
        item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
    item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract_first()
    for p in response.xpath('//*[@class="documentContent"]/p').extract():
        text += remove_tags(p).replace('\r', '')
    text = text.replace('\t', '')
    item['text'] = text
    item['url'] = response.url
    yield item
def parse_item_2(self, response):
def parse_item_4(self, response):
item = NoticiasItem()
text = ''
path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = self.date
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract()
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
for path in path_list:
for paragraph in response.xpath(path).extract():
text += remove_tags(paragraph)
for p in response.xpath(path).extract():
text += remove_tags(p).replace('\r','')
text = text.replace('\t','')
item['text'] = text
item['url'] = response.url
# print item['title']
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment