Commit 9d0fd052 authored by Renán Sosa Guillen

actualizadas fecha + zona horaria

parent a55f3c4d
......@@ -8,7 +8,7 @@ USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=09 -a day=13
No es recomendable para fechas de más de un mes de antiguas.
No es recomendable para fechas de mas de un mes de antiguas.
TAG_RE = re.compile(r'<[^>]+>')
......@@ -89,11 +89,17 @@ class QuotesSpider(scrapy.Spider):
text = ''
item = NoticiasItem()
item['title'] = response.css('h1.entry-title::text').extract_first()
item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
if item['date'] is None:
item['date'] = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
if item['date'] is None:
item['date'] = response.xpath('//time[@class="updated"]/@datetime').extract_first()
d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
if d is None:
d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
if d is None:
d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
......@@ -93,11 +93,17 @@ class QuotesSpider(scrapy.Spider):
text = ''
item = NoticiasItem()
item['title'] = response.css('h1.entry-title::text').extract_first()
item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
if item['date'] is None:
item['date'] = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
if item['date'] is None:
item['date'] = response.xpath('//time[@class="updated"]/@datetime').extract_first()
d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
if d is None:
d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
if d is None:
d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
......@@ -7,7 +7,7 @@ USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=2 -a day=21
No es recomendable para fechas de más de un mes de antiguas.
No es recomendable para fechas de mas de un mes de antiguas.
TAG_RE = re.compile(r'<[^>]+>')
......@@ -80,7 +80,10 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract()
if len(title) > 0:
item['title'] = title[0]
......@@ -92,5 +95,6 @@ class QuotesSpider(scrapy.Spider):
text += paragraph
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
......@@ -73,6 +73,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
## la fecha de las noticias ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract()
if ( len(title) > 0 ):
import scrapy, json, re
from datetime import datetime, date, timedelta
from datetime import datetime, date, timedelta, tzinfo
Esta version descarga ingresando una fecha.
......@@ -7,7 +7,7 @@ USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
No es recomendable para fechas de más de un mes de antiguas.
No es recomendable para fechas de mas de un mes de antiguas.
......@@ -16,6 +16,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone six hours behind UTC (UTC-6).

    The original comments describe this as the zone for Yucatan
    (central Mexico). Despite the class name, the offset is not zero.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        return timedelta(hours=-6)

    def dst(self, dt):
        """Return a zero DST adjustment, since the offset is fixed.

        The base ``tzinfo.dst`` raises NotImplementedError, which breaks
        calls such as ``datetime.timetuple()`` on aware datetimes.
        """
        return timedelta(0)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -32,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self):
# self.found = False
# self.flag = False = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None) = getattr(self, 'day', None)
......@@ -83,7 +96,21 @@ class QuotesSpider(scrapy.Spider):
for link in link_list:
if ( link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-' ):
item = NoticiasItem()
item['date'] = link[:link.rfind('/')]
d = link[:link.rfind('/')]
if len(d) == 10:
d = map(int, d.split('-'))
d = datetime(d[0], d[1], d[2],'T')
elif len(d) == 19:
d, t = d.split(' ')
d = map(int, d.split('-'))
t = map(int, t.split(':'))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],'T')
item['date'] = d
item['topic'] = response.url[response.url.rfind('/')+1:].title()
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
......@@ -144,7 +171,21 @@ class QuotesSpider(scrapy.Spider):
if ( this_date == self.req_date ):
item = NoticiasItem()
item['date'] = line['publishDate']
# item['date'] = line['publishDate']
d = line['publishDate']
if len(d) == 10:
d = map(int, d.split('-'))
d = datetime(d[0], d[1], d[2],'T')
elif len(d) == 19:
d, t = d.split(' ')
d = map(int, d.split('-'))
t = map(int, t.split(':'))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],'T')
item['date'] = d
item['topic'] = topic
item['title'] = line['name']
if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
import scrapy, json, re
from datetime import datetime, date, timedelta, tzinfo
Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
......@@ -17,6 +18,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone six hours behind UTC (UTC-6).

    The original comments describe this as the zone for Yucatan
    (central Mexico). Despite the class name, the offset is not zero.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        return timedelta(hours=-6)

    def dst(self, dt):
        """Return a zero DST adjustment, since the offset is fixed.

        The base ``tzinfo.dst`` raises NotImplementedError, which breaks
        calls such as ``datetime.timetuple()`` on aware datetimes.
        """
        return timedelta(0)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -31,11 +44,12 @@ class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self): = UTC()
self.baseURL = ''
# section_list = [('yucatan',123,'Yucatan'), ('quintana-roo',52,'Quintana Roo'),
# ('campeche',32,'Campeche'), ('opinion',0,'Opinion'), ('deportes',91,'Deportes'),
# ('nacional',100,'Nacional'), ('internacional',87,'Internacional')]
section_list = [('opinion',0,'Opinion')]
section_list = [('yucatan',123,'Yucatan'), ('quintana-roo',52,'Quintana Roo'),
('campeche',32,'Campeche'), ('opinion',0,'Opinion'), ('deportes',91,'Deportes'),
('nacional',100,'Nacional'), ('internacional',87,'Internacional')]
# section_list = [('opinion',0,'Opinion')]
for section in section_list:
self.section = section
......@@ -56,7 +70,20 @@ class QuotesSpider(scrapy.Spider):
for line in json_list:
item = NoticiasItem()
item['date'] = line['publishDate']
d = line['publishDate']
if len(d) == 10:
d = map(int, d.split('-'))
d = datetime(d[0], d[1], d[2],'T')
elif len(d) == 19:
d, t = d.split(' ')
d = map(int, d.split('-'))
t = map(int, t.split(':'))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],'T')
item['date'] = d
item['topic'] = self.section[2]
item['title'] = line['name']
if not ( self.section[0] == 'opinion' ):
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,41 +18,52 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('').css('a.last::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[0].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('').css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('').css('h1.entry-title::text').extract_first()
item['date'] = response.css('').css('time.entry-date::attr(datetime)').extract_first()
d = response.css('').css('time.entry-date::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('').css('a::text').extract_first()
for paragraph in response.css('').css('p').extract():
text += remove_tags(paragraph) + '\n'
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,46 +18,59 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Yield one parse_item request per article link on a listing page."""
    hrefs = response.css('div.bp-head').css('h2').css('a::attr(href)').extract()
    for href in hrefs:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()
item['date'] = response.css('').css('span').css('time::attr(datetime)').extract_first()
d = response.css('').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
#print item['title']
yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -21,28 +21,30 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
......@@ -55,14 +57,17 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
    """Build a NoticiasItem from an article page and yield it.

    Fills date, title, topic, author, text and url from the response.
    """
    item = NoticiasItem()
    # The published date on this site already includes the time zone.
    item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    item['title'] = response.css('h1.story-title::text').extract_first()
    item['topic'] = response.css('h3.story-cat::text').extract_first()
    item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()

    # One line of plain text per paragraph, markup stripped.
    paragraphs = response.xpath('//div[@id="content-area"]/p').extract()
    item['text'] = ''.join([remove_tags(chunk) + '\n' for chunk in paragraphs])

    item['url'] = response.url
    yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -21,16 +21,14 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
......@@ -42,13 +40,16 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
item['author'] = response.xpath('//span[@itemprop="name"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
......@@ -22,16 +21,14 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
......@@ -56,9 +53,10 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = response.css('h1.entry-title::text').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
import scrapy
import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -26,11 +25,8 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
......@@ -55,7 +51,13 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico)
if d[-6:] != '-05:00' and d[-6:] != '-06:00' :
d = d[:-6] + '-06:00'
item['date'] = d
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
......@@ -54,7 +54,12 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
item['date'] = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria del centro de mexico
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
item['title'] = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
from scrapy.spidermiddlewares.httperror import HttpError
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -77,10 +76,19 @@ class QuotesSpider(scrapy.Spider):
self.stop = False
page = 0
# while not self.stop:
# # for page in range(0, 50):
# if page == 0:
# yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
# elif page > 0:
# yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
# page += 1
while not self.stop:
# for page in range(0, 50):
if page == 0:
yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
elif page > 0:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
......@@ -92,7 +100,11 @@ class QuotesSpider(scrapy.Spider):
# print('**********hey, 404! TRUE!!!')
# self.stop = True
# else:
for link in response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract():
link_list = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
if len(link_list) <= 0:
link_list = response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract()
for link in link_list:
yield scrapy.Request(url=link, callback=self.parse_item)
......@@ -106,14 +118,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
if item['date'] is None or item['date'] == '':
item['date'] = response.xpath('//meta[@property=""]/@content').extract_first()
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
if d is None or d == '':
d = response.xpath('//meta[@property=""]/@content').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()
for paragraph in response.xpath('//*[@class="post_content_wrapper"]/p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
\ No newline at end of file
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,8 +18,10 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
......@@ -30,6 +33,7 @@ class QuotesSpider(scrapy.Spider):
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
pagination = response.css('').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
......@@ -43,21 +47,32 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Yield one parse_item request per article link on a listing page."""
    hrefs = response.css('h3.entry-title').css('a::attr(href)').extract()
    for href in hrefs:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('').css('h1.entry-title::text').extract_first()
item['date'] = response.css('').css('').css('time::attr(datetime)').extract_first()
d = response.css('').css('').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
item['author'] = response.css('').css('a::text').extract_first()
for paragraph in response.css('').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,8 +18,10 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
......@@ -30,6 +33,7 @@ class QuotesSpider(scrapy.Spider):
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
pagination = response.css('').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
......@@ -43,20 +47,30 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('').css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('').css('h1.entry-title::text').extract_first()
item['date'] = response.css('').css('').css('time::attr(datetime)').extract_first()
d = response.css('').css('').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['url'] = response.url
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
for paragraph in response.css('').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
# print item['title']
yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,6 +18,7 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
......@@ -24,11 +26,9 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
......@@ -37,16 +37,25 @@ class QuotesSpider(scrapy.Spider):
next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('h1.title::text').extract_first()
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['author'] = response.css('').css('a::text').extract_first()
item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract()
for paragraph in response.css('').css('p').extract():
text += remove_tags(paragraph)
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy
import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,32 +17,36 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for post in response.css('div.mosaicflow').css(''):
item = NoticiasItem()
......@@ -50,16 +54,24 @@ class QuotesSpider(scrapy.Spider):
item['title'] = post.xpath('./h1/a/@title').extract_first()
request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
item = response.meta['item']
text = ''
item['date'] = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
d = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.single_text').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
print item['title']
# print item['title']
yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return text with every markup tag matched by TAG_RE removed."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,46 +17,60 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Yield one parse_item request per article link on a listing page."""
    hrefs = response.css('div.bp-head').css('h2').css('a::attr(href)').extract()
    for href in hrefs:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.main_container').css('').extract_first()
item['date'] = response.css('').css('span').css('time::attr(datetime)').extract_first()
d = response.css('').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy
import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -19,30 +19,34 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
urls = [
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
# Callback for the first listing page: reads the pagination links and issues
# requests whose responses are handled by parse_page.
# NOTE(review): indentation is missing in this copy of the file, so which
# branch each `yield` below belongs to cannot be read from here -- confirm
# the nesting against the repository before relying on these comments.
def parse(self, response):
# Every href found inside the pagination container.
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
# Only the last pagination link is used from here on.
pagination = pagination[-1]
# The page count is the integer that follows the final '/' of that link.
# NOTE(review): a link ending in '/' would leave an empty string here and
# int() would raise ValueError -- confirm the shape of the site's URLs.
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
# dont_filter=True: response.url was already fetched once, so the request
# must bypass Scrapy's duplicate filter to be scheduled again.
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
# `page` is 0-based, hence page+1 in the URL.
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Schedule one request per article link found on a listing page.

    Links are the ``href`` of each ``h3/a`` directly under a
    ``div.list-block``; every one is requested with ``parse_item`` as its
    callback.
    """
    listing_blocks = response.css('div.list-block')
    article_urls = listing_blocks.xpath('./h3/a/@href').extract()
    for article_url in article_urls:
        yield scrapy.Request(url=article_url, callback=self.parse_item)
......@@ -51,12 +55,20 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
item['title'] = response.css('').css('h1.entry-title::text').extract_first()
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()[:-2]
for paragraph in response.css('div.post_content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy
import re
import scrapy, re
from datetime import datetime, date, timedelta, tzinfo
from scrapy_splash import SplashRequest
......@@ -14,6 +14,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for central Mexico (UTC-6).

    Implements ``utcoffset``, ``tzname`` and ``dst`` so that every
    ``datetime`` method that consults the zone works; without ``dst`` the
    base class raises ``NotImplementedError`` from calls such as
    ``datetime.timetuple()``.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero adjustment: this zone models a fixed offset only."""
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -28,12 +40,13 @@ class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self): = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
# day = getattr(self, 'day', None)
parse_month = {'1': 'Enero', '2': 'Febrero', '3': 'Marzo', '4': 'Abril',
'5': 'Mayo', '6': 'Junio', '7': 'Julio', '8': 'Agosto',
'9': 'Septiembre', '10': 'Octubre', '11': 'Noviembre', '12': 'Diciembre'}
parse_month = {'1': 'enero', '2': 'febrero', '3': 'marzo', '4': 'abril',
'5': 'mayo', '6': 'junio', '7': 'julio', '8': 'agosto',
'9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'} = parse_month[month]+' de '+year
......@@ -46,6 +59,7 @@ class QuotesSpider(scrapy.Spider):
for post in response.css('div.catpor-box'):
post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first()
post_date = post_date[post_date.find('d')+3:]
if post_date ==
link = post.xpath('./div/div/a/@href').extract_first()
yield scrapy.Request(url=link, callback=self.parse_2)
......@@ -60,13 +74,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
    """Build a ``NoticiasItem`` from an article page and yield it.

    Populates ``date``, ``title``, ``topic``, ``text`` and ``url``.  The
    published date is read as text from the page, parsed as
    ``YYYY-MM-DD HH:MM:SS``, tagged with the spider's UTC-6 zone and stored
    as an ISO-8601 string (``T`` separator, ``-06:00`` offset).  If the
    element is missing the date is ``None``; if its text does not match the
    expected layout the raw text is stored unchanged.
    """
    item = NoticiasItem()

    published = response.xpath('//div[@id="primary"]/div/div/div/div/span[@class="published"]/text()').extract_first()
    if published is not None:
        try:
            parsed = datetime.strptime(published.strip(), '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # Unexpected layout: keep the site's own text rather than
            # losing the whole article to an exception.
            parsed = None
        if parsed is not None:
            # Attach the fixed UTC-6 zone and serialise with a 'T' separator
            # so the value carries its offset when exported.
            published = parsed.replace(tzinfo=UTC()).isoformat('T')
    item['date'] = published

    item['title'] = response.xpath('//div[@id="primary"]/div/h1/text()').extract_first()
    item['topic'] = response.xpath('//span[@class="entry-categories"]/text()').extract_first()

    # One line of plain text per <p> element, markup stripped.
    paragraphs = response.xpath('//div[@id="primary"]/div/div/div/div[@class="entry-content"]/div/p').extract()
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)

    item['url'] = response.url
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment