Commit 9d0fd052 authored by Renán Sosa Guillen

actualizadas fecha + zona horaria

parent a55f3c4d
...@@ -8,7 +8,7 @@ USO: ...@@ -8,7 +8,7 @@ USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=09 -a day=13 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=09 -a day=13
No es recomendable para fechas de más de un mes de antiguas. No es recomendable para fechas de mas de un mes de antiguas.
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
...@@ -89,11 +89,17 @@ class QuotesSpider(scrapy.Spider): ...@@ -89,11 +89,17 @@ class QuotesSpider(scrapy.Spider):
text = '' text = ''
item = NoticiasItem() item = NoticiasItem()
item['title'] = response.css('h1.entry-title::text').extract_first() item['title'] = response.css('h1.entry-title::text').extract_first()
item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
if item['date'] is None: d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
item['date'] = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first() if d is None:
if item['date'] is None: d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
item['date'] = response.xpath('//time[@class="updated"]/@datetime').extract_first() if d is None:
d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.entry-content').css('p').extract(): for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
......
...@@ -93,11 +93,17 @@ class QuotesSpider(scrapy.Spider): ...@@ -93,11 +93,17 @@ class QuotesSpider(scrapy.Spider):
text = '' text = ''
item = NoticiasItem() item = NoticiasItem()
item['title'] = response.css('h1.entry-title::text').extract_first() item['title'] = response.css('h1.entry-title::text').extract_first()
item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
if item['date'] is None: d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
item['date'] = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first() if d is None:
if item['date'] is None: d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
item['date'] = response.xpath('//time[@class="updated"]/@datetime').extract_first() if d is None:
d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.entry-content').css('p').extract(): for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
......
...@@ -7,7 +7,7 @@ USO: ...@@ -7,7 +7,7 @@ USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=2 -a day=21 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=2 -a day=21
No es recomendable para fechas de más de un mes de antiguas. No es recomendable para fechas de mas de un mes de antiguas.
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
...@@ -80,7 +80,10 @@ class QuotesSpider(scrapy.Spider): ...@@ -80,7 +80,10 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract() title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract()
if len(title) > 0: if len(title) > 0:
item['title'] = title[0] item['title'] = title[0]
...@@ -92,5 +95,6 @@ class QuotesSpider(scrapy.Spider): ...@@ -92,5 +95,6 @@ class QuotesSpider(scrapy.Spider):
text += paragraph text += paragraph
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
...@@ -73,6 +73,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -73,6 +73,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
## la fecha de las noticias ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract() title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract()
if ( len(title) > 0 ): if ( len(title) > 0 ):
......
import scrapy, json, re import scrapy, json, re
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta, tzinfo
""" """
Esta version descarga ingresando una fecha. Esta version descarga ingresando una fecha.
...@@ -7,7 +7,7 @@ USO: ...@@ -7,7 +7,7 @@ USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
No es recomendable para fechas de más de un mes de antiguas. No es recomendable para fechas de mas de un mes de antiguas.
""" """
...@@ -16,6 +16,18 @@ def remove_tags(text): ...@@ -16,6 +16,18 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped article dates.

    Despite the class name, this does not model UTC itself: every method
    describes a constant offset of six hours behind UTC with a zero
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        # Time zone for Yucatan (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``; defining it
        here keeps ``datetime.dst()``, ``timetuple()`` and ``astimezone()``
        usable on datetimes that carry this time zone.
        """
        return timedelta(0)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -32,6 +44,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -32,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self): def start_requests(self):
# self.found = False # self.found = False
# self.flag = False # self.flag = False
self.tz = UTC()
self.year = getattr(self, 'year', None) self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None) self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None) self.day = getattr(self, 'day', None)
...@@ -83,7 +96,21 @@ class QuotesSpider(scrapy.Spider): ...@@ -83,7 +96,21 @@ class QuotesSpider(scrapy.Spider):
for link in link_list: for link in link_list:
if ( link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) ): if ( link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) ):
item = NoticiasItem() item = NoticiasItem()
item['date'] = link[:link.rfind('/')]
d = link[:link.rfind('/')]
if len(d) == 10:
d = map(int, d.split('-'))
d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
elif len(d) == 19:
d, t = d.split(' ')
d = map(int, d.split('-'))
t = map(int, t.split(':'))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
item['date'] = d
item['topic'] = response.url[response.url.rfind('/')+1:].title() item['topic'] = response.url[response.url.rfind('/')+1:].title()
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2) # yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2) request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
...@@ -144,7 +171,21 @@ class QuotesSpider(scrapy.Spider): ...@@ -144,7 +171,21 @@ class QuotesSpider(scrapy.Spider):
if ( this_date == self.req_date ): if ( this_date == self.req_date ):
item = NoticiasItem() item = NoticiasItem()
item['date'] = line['publishDate'] # item['date'] = line['publishDate']
d = line['publishDate']
if len(d) == 10:
d = map(int, d.split('-'))
d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
elif len(d) == 19:
d, t = d.split(' ')
d = map(int, d.split('-'))
t = map(int, t.split(':'))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
item['date'] = d
item['topic'] = topic item['topic'] = topic
item['title'] = line['name'] item['title'] = line['name']
if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ): if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
......
import scrapy, json, re import scrapy, json, re
from datetime import datetime, date, timedelta, tzinfo
""" """
Esta version descarga todas las noticias contenidas en la pagina, sin necesidad Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
...@@ -17,6 +18,18 @@ def remove_tags(text): ...@@ -17,6 +18,18 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped article dates.

    Despite the class name, this does not model UTC itself: every method
    describes a constant offset of six hours behind UTC with a zero
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        # Time zone for Yucatan (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``; defining it
        here keeps ``datetime.dst()``, ``timetuple()`` and ``astimezone()``
        usable on datetimes that carry this time zone.
        """
        return timedelta(0)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -31,11 +44,12 @@ class QuotesSpider(scrapy.Spider): ...@@ -31,11 +44,12 @@ class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
self.baseURL = 'https://www.lajornadamaya.mx' self.baseURL = 'https://www.lajornadamaya.mx'
# section_list = [('yucatan',123,'Yucatan'), ('quintana-roo',52,'Quintana Roo'), section_list = [('yucatan',123,'Yucatan'), ('quintana-roo',52,'Quintana Roo'),
# ('campeche',32,'Campeche'), ('opinion',0,'Opinion'), ('deportes',91,'Deportes'), ('campeche',32,'Campeche'), ('opinion',0,'Opinion'), ('deportes',91,'Deportes'),
# ('nacional',100,'Nacional'), ('internacional',87,'Internacional')] ('nacional',100,'Nacional'), ('internacional',87,'Internacional')]
section_list = [('opinion',0,'Opinion')] # section_list = [('opinion',0,'Opinion')]
for section in section_list: for section in section_list:
self.section = section self.section = section
...@@ -56,7 +70,20 @@ class QuotesSpider(scrapy.Spider): ...@@ -56,7 +70,20 @@ class QuotesSpider(scrapy.Spider):
for line in json_list: for line in json_list:
item = NoticiasItem() item = NoticiasItem()
item['date'] = line['publishDate']
d = line['publishDate']
if len(d) == 10:
d = map(int, d.split('-'))
d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
elif len(d) == 19:
d, t = d.split(' ')
d = map(int, d.split('-'))
t = map(int, t.split(':'))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
item['date'] = d
item['topic'] = self.section[2] item['topic'] = self.section[2]
item['title'] = line['name'] item['title'] = line['name']
if not ( self.section[0] == 'opinion' ): if not ( self.section[0] == 'opinion' ):
......
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,41 +18,52 @@ class NoticiasItem(scrapy.Item): ...@@ -17,41 +18,52 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://alchile.com.mx/'+year+'/'+month+'/'+day self.baseURL='http://alchile.com.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.css('div.page-nav').css('a.last::attr(href)').extract() pagination = response.css('div.page-nav').css('a.last::attr(href)').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
pagination = pagination[0].strip('/') pagination = pagination[0].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages): for page in range(0,pages):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response): def parse_page(self, response):
for link in response.css('div.td-block-span6').css('h3.entry-title').css('a::attr(href)').extract(): for link in response.css('div.td-block-span6').css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['title'] = response.css('header.td-post-title').css('h1.entry-title::text').extract_first() item['title'] = response.css('header.td-post-title').css('h1.entry-title::text').extract_first()
item['date'] = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
d = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.td-post-header').css('a::text').extract_first() item['topic'] = response.css('div.td-post-header').css('a::text').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract(): for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
......
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,46 +18,59 @@ class NoticiasItem(scrapy.Item): ...@@ -17,46 +18,59 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://grilloporteno.com/'+year+'/'+month+'/'+day self.baseURL='http://grilloporteno.com/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract() pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/') pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages): for page in range(0, pages):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response): def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract(): for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first() item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()
item['date'] = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first() item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
for paragraph in response.css('div.entry-content').css('p').extract(): for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
#print item['title'] #print item['title']
yield item yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -21,28 +21,30 @@ class NoticiasItem(scrapy.Item): ...@@ -21,28 +21,30 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2) self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract() pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/') pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages): for page in range(0, pages):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
...@@ -55,14 +57,17 @@ class QuotesSpider(scrapy.Spider): ...@@ -55,14 +57,17 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = response.css('h1.story-title::text').extract_first() item['title'] = response.css('h1.story-title::text').extract_first()
item['topic'] = response.css('h3.story-cat::text').extract_first() item['topic'] = response.css('h3.story-cat::text').extract_first()
item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first() item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()
for paragraph in response.xpath('//div[@id="content-area"]/p').extract(): for paragraph in response.xpath('//div[@id="content-area"]/p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -21,16 +21,14 @@ class NoticiasItem(scrapy.Item): ...@@ -21,16 +21,14 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
...@@ -42,13 +40,16 @@ class QuotesSpider(scrapy.Spider): ...@@ -42,13 +40,16 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first() item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first() item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
item['author'] = response.xpath('//span[@itemprop="name"]/text()').extract_first() item['author'] = response.xpath('//span[@itemprop="name"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract(): for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
...@@ -22,16 +21,14 @@ class NoticiasItem(scrapy.Item): ...@@ -22,16 +21,14 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
...@@ -56,9 +53,10 @@ class QuotesSpider(scrapy.Spider): ...@@ -56,9 +53,10 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = response.css('h1.entry-title::text').extract_first() item['title'] = response.css('h1.entry-title::text').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract() item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract(): for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
......
import scrapy import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -26,11 +25,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -26,11 +25,8 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
...@@ -55,7 +51,13 @@ class QuotesSpider(scrapy.Spider): ...@@ -55,7 +51,13 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico)
if d[-6:] != '-05:00' and d[-6:] != '-06:00' :
d = d[:-6] + '-06:00'
item['date'] = d
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first() item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first() item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
......
...@@ -54,7 +54,12 @@ class QuotesSpider(scrapy.Spider): ...@@ -54,7 +54,12 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first() d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria del centro de mexico
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2] item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
item['title'] = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first() item['title'] = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
......
from scrapy.spidermiddlewares.httperror import HttpError from scrapy.spidermiddlewares.httperror import HttpError
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -77,10 +76,19 @@ class QuotesSpider(scrapy.Spider): ...@@ -77,10 +76,19 @@ class QuotesSpider(scrapy.Spider):
self.stop = False self.stop = False
page = 0 page = 0
# while not self.stop:
# # for page in range(0, 50):
# if page == 0:
# yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
# elif page > 0:
# yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
# page += 1
while not self.stop: while not self.stop:
# for page in range(0, 50):
if page == 0: if page == 0:
yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http) yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
elif page > 0: elif page > 0:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http) yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
...@@ -92,7 +100,11 @@ class QuotesSpider(scrapy.Spider): ...@@ -92,7 +100,11 @@ class QuotesSpider(scrapy.Spider):
# print('**********hey, 404! TRUE!!!') # print('**********hey, 404! TRUE!!!')
# self.stop = True # self.stop = True
# else: # else:
for link in response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract(): link_list = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
if len(link_list) <= 0:
link_list = response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract()
for link in link_list:
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
...@@ -106,14 +118,22 @@ class QuotesSpider(scrapy.Spider): ...@@ -106,14 +118,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
    """Build one NoticiasItem from an article page and yield it.

    The publication date is read from the ``article:published_time`` meta
    tag, falling back to ``DC.date.issued`` when the first one is missing
    or empty. When a date is found its zone designator is forced to
    '-06:00'; when none is found ``item['date']`` keeps the empty/None
    value instead of raising.
    """
    item = NoticiasItem()

    published = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    if not published:
        published = response.xpath('//meta[@property="DC.date.issued"]/@content').extract_first()
    if published:
        # '-06:00' is UTC-6, the zone used for Yucatan (central Mexico).
        # Only a real trailing designator ('Z' or +/-hh:mm) is replaced; a
        # value without one gets the offset appended rather than losing its
        # last six characters. The wall-clock part is left as published.
        published = re.sub(r'(?:Z|[+-]\d{2}:\d{2})$', '', published.strip()) + '-06:00'
    item['date'] = published

    item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
    item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()

    paragraphs = response.xpath('//*[@class="post_content_wrapper"]/p').extract()
    # One line of plain text per <p> element, each terminated by a newline.
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
    item['url'] = response.url
    yield item
\ No newline at end of file
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by TAG_RE removed."""
    # Splitting on the tags and gluing the pieces back together is the same
    # as substituting each tag with the empty string.
    return ''.join(TAG_RE.split(text))
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,8 +18,10 @@ class NoticiasItem(scrapy.Item): ...@@ -17,8 +18,10 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
...@@ -30,6 +33,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -30,6 +33,7 @@ class QuotesSpider(scrapy.Spider):
for url in urls: for url in urls:
yield scrapy.Request(url=url, callback=self.parse) yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.css('div.page-nav').css('a::attr(href)').extract() pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
...@@ -43,21 +47,32 @@ class QuotesSpider(scrapy.Spider): ...@@ -43,21 +47,32 @@ class QuotesSpider(scrapy.Spider):
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Yield one request, handled by ``parse_item``, per article link on a listing page."""
    article_links = response.css('h3.entry-title').css('a::attr(href)').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
    """Build one NoticiasItem from an article page and yield it.

    Title, date, topic list, author and body text are read from the page
    with the selectors below. When a date is found its zone designator is
    forced to '-06:00'; when none is found ``item['date']`` is None instead
    of the method raising.
    """
    item = NoticiasItem()
    header = response.css('div.td-post-header')
    item['title'] = header.css('h1.entry-title::text').extract_first()

    published = header.css('span.td-post-date').css('time::attr(datetime)').extract_first()
    if published:
        # '-06:00' is UTC-6, the zone used for Yucatan (central Mexico).
        # Only a real trailing designator ('Z' or +/-hh:mm) is replaced; a
        # value without one gets the offset appended rather than losing its
        # last six characters. The wall-clock part is left as published.
        published = re.sub(r'(?:Z|[+-]\d{2}:\d{2})$', '', published.strip()) + '-06:00'
    item['date'] = published

    item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
    item['author'] = response.css('div.td-post-author-name').css('a::text').extract_first()

    paragraphs = response.css('div.td-post-content').css('p').extract()
    # One line of plain text per <p> element, each terminated by a newline.
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
    item['url'] = response.url
    yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip every TAG_RE match from ``text`` and return what is left."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,8 +18,10 @@ class NoticiasItem(scrapy.Item): ...@@ -17,8 +18,10 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
...@@ -30,6 +33,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -30,6 +33,7 @@ class QuotesSpider(scrapy.Spider):
for url in urls: for url in urls:
yield scrapy.Request(url=url, callback=self.parse) yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.css('div.page-nav').css('a::attr(href)').extract() pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
...@@ -43,20 +47,30 @@ class QuotesSpider(scrapy.Spider): ...@@ -43,20 +47,30 @@ class QuotesSpider(scrapy.Spider):
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Yield one request, handled by ``parse_item``, per article link in the main content column."""
    main_column = response.css('div.td-ss-main-content')
    article_links = main_column.css('h3.entry-title').css('a::attr(href)').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
    """Build one NoticiasItem from an article page and yield it.

    Title, date, URL, topic list and body text are read from the page with
    the selectors below. When a date is found its zone designator is forced
    to '-06:00'; when none is found ``item['date']`` is None instead of the
    method raising.
    """
    item = NoticiasItem()
    header = response.css('div.td-post-header')
    item['title'] = header.css('h1.entry-title::text').extract_first()

    published = header.css('span.td-post-date').css('time::attr(datetime)').extract_first()
    if published:
        # '-06:00' is UTC-6, the zone used for Yucatan (central Mexico).
        # Only a real trailing designator ('Z' or +/-hh:mm) is replaced; a
        # value without one gets the offset appended rather than losing its
        # last six characters. The wall-clock part is left as published.
        published = re.sub(r'(?:Z|[+-]\d{2}:\d{2})$', '', published.strip()) + '-06:00'
    item['date'] = published

    item['url'] = response.url
    item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()

    paragraphs = response.css('div.td-post-content').css('p').extract()
    # One line of plain text per <p> element, each terminated by a newline.
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
    yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by TAG_RE removed."""
    pieces = TAG_RE.split(text)  # the text found between consecutive tags
    return ''.join(pieces)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,6 +18,7 @@ class NoticiasItem(scrapy.Item): ...@@ -17,6 +18,7 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
...@@ -24,11 +26,9 @@ class QuotesSpider(scrapy.Spider): ...@@ -24,11 +26,9 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='https://www.puntomedio.mx/'+year+'/'+month+'/'+day self.baseURL='https://www.puntomedio.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract(): for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
...@@ -37,16 +37,25 @@ class QuotesSpider(scrapy.Spider): ...@@ -37,16 +37,25 @@ class QuotesSpider(scrapy.Spider):
next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first() next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
yield scrapy.Request(url=next_page, callback=self.parse) yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
    """Build one NoticiasItem from an article page and yield it.

    Title, date, author, topic list and body text are read from the page
    with the selectors below. When a date is found its zone designator is
    forced to '-06:00'; when none is found ``item['date']`` is None instead
    of the method raising.
    """
    item = NoticiasItem()
    item['title'] = response.css('h1.title::text').extract_first()

    published = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    if published:
        # '-06:00' is UTC-6, the zone used for Yucatan (central Mexico).
        # Only a real trailing designator ('Z' or +/-hh:mm) is replaced; a
        # value without one gets the offset appended rather than losing its
        # last six characters. The wall-clock part is left as published.
        published = re.sub(r'(?:Z|[+-]\d{2}:\d{2})$', '', published.strip()) + '-06:00'
    item['date'] = published

    item['author'] = response.css('span.author').css('a::text').extract_first()
    item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract()

    paragraphs = response.css('div.post-entry').css('p').extract()
    # Paragraph texts are concatenated with no separator, as this spider did before.
    item['text'] = ''.join(remove_tags(paragraph) for paragraph in paragraphs)
    item['url'] = response.url
    yield item
import scrapy import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip every TAG_RE match from ``text`` and return what is left."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,32 +17,36 @@ class NoticiasItem(scrapy.Item): ...@@ -17,32 +17,36 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self):
    """Yield the first request for the archive selected by the spider arguments.

    Reads the ``year``, ``month`` and ``day`` attributes (None when not
    set), stores the resulting URL in ``self.baseURL`` and yields a single
    request whose callback is ``parse``.
    """
    date_parts = [getattr(self, name, None) for name in ('year', 'month', 'day')]
    self.baseURL = 'http://sona893.fm/' + '/'.join(date_parts)
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule ``parse_page`` for every page of the current archive.

    The page count is the integer after the last '/' of the final link
    inside ``div.pagination``. Without pagination links only the current
    URL is re-requested.
    """
    hrefs = response.css('div.pagination').css('a::attr(href)').extract()
    if not hrefs:
        # Single-page archive: this URL was already fetched, so bypass the dupe filter.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        return

    last_href = hrefs[-1].strip('/')
    total_pages = int(last_href[last_href.rfind('/') + 1:])
    for number in range(1, total_pages + 1):
        if number == 1:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
            continue
        yield scrapy.Request(url=self.baseURL + "/page/" + str(number), callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
for post in response.css('div.mosaicflow').css('div.post'): for post in response.css('div.mosaicflow').css('div.post'):
item = NoticiasItem() item = NoticiasItem()
...@@ -50,16 +54,24 @@ class QuotesSpider(scrapy.Spider): ...@@ -50,16 +54,24 @@ class QuotesSpider(scrapy.Spider):
item['title'] = post.xpath('./h1/a/@title').extract_first() item['title'] = post.xpath('./h1/a/@title').extract_first()
request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item) request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item request.meta['item'] = item
yield request yield request
def parse_item(self, response):
    """Complete the item carried in ``response.meta['item']`` and yield it.

    Adds date, body text and URL to the item that came with the request.
    When a date is found its zone designator is forced to '-06:00'; when
    none is found ``item['date']`` is None instead of the method raising.
    """
    item = response.meta['item']

    published = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
    if published:
        # '-06:00' is UTC-6, the zone used for Yucatan (central Mexico).
        # Only a real trailing designator ('Z' or +/-hh:mm) is replaced; a
        # value without one gets the offset appended rather than losing its
        # last six characters. The wall-clock part is left as published.
        published = re.sub(r'(?:Z|[+-]\d{2}:\d{2})$', '', published.strip()) + '-06:00'
    item['date'] = published

    paragraphs = response.css('div.single_text').css('p').extract()
    # One line of plain text per <p> element, each terminated by a newline.
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
    item['url'] = response.url
    yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by TAG_RE removed."""
    # Splitting on the tags and re-joining equals replacing each tag with ''.
    return ''.join(TAG_RE.split(text))
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -17,46 +17,60 @@ class NoticiasItem(scrapy.Item): ...@@ -17,46 +17,60 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self):
    """Yield the first request for the archive selected by the spider arguments.

    Reads the ``year``, ``month`` and ``day`` attributes (None when not
    set), stores the resulting URL in ``self.baseURL`` and yields a single
    request whose callback is ``parse``.
    """
    date_parts = [getattr(self, name, None) for name in ('year', 'month', 'day')]
    self.baseURL = 'http://www.yucatanalamano.com/' + '/'.join(date_parts)
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule ``parse_page`` for every page of the current archive.

    The page count is the integer after the last '/' of the final link
    inside ``div.pagination``. Without pagination links only the current
    URL is re-requested.
    """
    hrefs = response.css('div.pagination').css('a::attr(href)').extract()
    if not hrefs:
        # Single-page archive: this URL was already fetched, so bypass the dupe filter.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        return

    last_href = hrefs[-1].strip('/')
    total_pages = int(last_href[last_href.rfind('/') + 1:])
    for number in range(1, total_pages + 1):
        if number == 1:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
            continue
        yield scrapy.Request(url=self.baseURL + "/page/" + str(number), callback=self.parse_page)
def parse_page(self, response):
    """Yield one request, handled by ``parse_item``, per article link on a listing page."""
    article_links = response.css('div.bp-head').css('h2').css('a::attr(href)').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
    """Build one NoticiasItem from an article page and yield it.

    Title, date, topic and body text are read from the page with the
    selectors below. When a date is found its zone designator is forced to
    '-06:00'; when none is found ``item['date']`` is None instead of the
    method raising.
    """
    item = NoticiasItem()
    item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first()

    published = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
    if published:
        # '-06:00' is UTC-6, the zone used for Yucatan (central Mexico).
        # Only a real trailing designator ('Z' or +/-hh:mm) is replaced; a
        # value without one gets the offset appended rather than losing its
        # last six characters. The wall-clock part is left as published.
        published = re.sub(r'(?:Z|[+-]\d{2}:\d{2})$', '', published.strip()) + '-06:00'
    item['date'] = published

    item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()

    paragraphs = response.css('div.entry-content').css('p').extract()
    # One line of plain text per <p> element, each terminated by a newline.
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
    item['url'] = response.url
    yield item
import scrapy import scrapy, re
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -19,30 +19,34 @@ class NoticiasItem(scrapy.Item): ...@@ -19,30 +19,34 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self):
    """Yield the first request for the archive selected by the spider arguments.

    Reads the ``year``, ``month`` and ``day`` attributes (None when not
    set), stores the resulting URL in ``self.baseURL`` and yields a single
    request whose callback is ``parse``.
    """
    date_parts = [getattr(self, name, None) for name in ('year', 'month', 'day')]
    self.baseURL = 'http://florcastillo.mx/noticias/' + '/'.join(date_parts)
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule ``parse_page`` for every page of the current archive.

    The page count is the integer after the last '/' of the final link
    inside ``div.pagination``. Without pagination links only the current
    URL is re-requested.
    """
    hrefs = response.css('div.pagination').css('a::attr(href)').extract()
    if not hrefs:
        # Single-page archive: this URL was already fetched, so bypass the dupe filter.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        return

    # Drop a trailing '/' first: otherwise the slice after the last '/'
    # is empty for a link such as '.../page/3/' and int('') raises.
    last_href = hrefs[-1].strip('/')
    total_pages = int(last_href[last_href.rfind('/') + 1:])
    for number in range(1, total_pages + 1):
        if number == 1:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
            continue
        yield scrapy.Request(url=self.baseURL + "/page/" + str(number), callback=self.parse_page)
def parse_page(self, response):
    """Yield one request, handled by ``parse_item``, per article link on a listing page."""
    article_links = response.css('div.list-block').xpath('./h3/a/@href').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
...@@ -51,12 +55,20 @@ class QuotesSpider(scrapy.Spider): ...@@ -51,12 +55,20 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['title'] = response.css('div.post-title').css('h1.entry-title::text').extract_first() item['title'] = response.css('div.post-title').css('h1.entry-title::text').extract_first()
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()[:-2] item['topic'] = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()[:-2]
for paragraph in response.css('div.post_content').css('p').extract(): for paragraph in response.css('div.post_content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
import scrapy import scrapy, re
import re from datetime import datetime, date, timedelta, tzinfo
from scrapy_splash import SplashRequest from scrapy_splash import SplashRequest
""" """
...@@ -14,6 +14,18 @@ def remove_tags(text): ...@@ -14,6 +14,18 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone: six hours behind UTC, with no daylight saving."""

    def utcoffset(self, dt):
        """Return the fixed offset from UTC (central Mexico: UTC-6)."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the name of the zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises NotImplementedError, which breaks
        calls such as ``datetime.timetuple()`` on datetimes using this zone.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -28,12 +40,13 @@ class QuotesSpider(scrapy.Spider): ...@@ -28,12 +40,13 @@ class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
# day = getattr(self, 'day', None) # day = getattr(self, 'day', None)
parse_month = {'1': 'Enero', '2': 'Febrero', '3': 'Marzo', '4': 'Abril', parse_month = {'1': 'enero', '2': 'febrero', '3': 'marzo', '4': 'abril',
'5': 'Mayo', '6': 'Junio', '7': 'Julio', '8': 'Agosto', '5': 'mayo', '6': 'junio', '7': 'julio', '8': 'agosto',
'9': 'Septiembre', '10': 'Octubre', '11': 'Noviembre', '12': 'Diciembre'} '9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'}
self.date = parse_month[month]+' de '+year self.date = parse_month[month]+' de '+year
...@@ -46,6 +59,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -46,6 +59,7 @@ class QuotesSpider(scrapy.Spider):
for post in response.css('div.catpor-box'): for post in response.css('div.catpor-box'):
post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first() post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first()
post_date = post_date[post_date.find('d')+3:] post_date = post_date[post_date.find('d')+3:]
if post_date == self.date: if post_date == self.date:
link = post.xpath('./div/div/a/@href').extract_first() link = post.xpath('./div/div/a/@href').extract_first()
yield scrapy.Request(url=link, callback=self.parse_2) yield scrapy.Request(url=link, callback=self.parse_2)
...@@ -60,13 +74,22 @@ class QuotesSpider(scrapy.Spider): ...@@ -60,13 +74,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
    """Build one NoticiasItem from an article page and yield it.

    The ``published`` span is expected to hold 'YYYY-MM-DD HH:MM:SS'. It is
    turned into an ISO-8601 string carrying the offset of ``self.tz``. A
    missing value is stored as None and a value in another format is stored
    unchanged, so the item is still emitted.
    """
    item = NoticiasItem()

    published = response.xpath('//div[@id="primary"]/div/div/div/div/span[@class="published"]/text()').extract_first()
    if published:
        try:
            # strptime replaces the manual split()/map() parsing, whose
            # indexing of map() results only works on Python 2.
            stamp = datetime.strptime(published.strip(), '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # Unexpected format: keep the raw text rather than lose the item.
            pass
        else:
            published = stamp.replace(tzinfo=self.tz).isoformat('T')
    item['date'] = published

    item['title'] = response.xpath('//div[@id="primary"]/div/h1/text()').extract_first()
    item['topic'] = response.xpath('//span[@class="entry-categories"]/text()').extract_first()

    paragraphs = response.xpath('//div[@id="primary"]/div/div/div/div[@class="entry-content"]/div/p').extract()
    # One line of plain text per <p> element, each terminated by a newline.
    item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
    item['url'] = response.url
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment