Commit 5953d008 authored by Renán Sosa Guillen

crawlers

parent ba65cc37
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
AlchileItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
AlchilePipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'alChile.spiders'
#USER_AGENT = 'alChile (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'alChile.pipelines.AlchilePipeline': 300,
#}
ITEM_PIPELINES = {
'alChile.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from alChile.items import NoticiasItem
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
MEDIO:
Al Chile, Yucatan
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -9,16 +13,6 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,26 +20,22 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://alchile.com.mx/'+year+'/'+month+'/'+day
self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule the first listing page and every further page of the day.

    Yields one request per listing page, each handled by ``parse_page``.
    """
    # The response already is the first listing page; dont_filter lets the
    # same URL be requested again so parse_page can process it.
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    pagination = response.css('div.page-nav').css('a::attr(href)').extract()
    if len(pagination) > 0:
        # NOTE(review): the second-to-last link is presumably the one that
        # carries the highest page number - confirm against the site markup.
        pagination = pagination[-2].strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])

        # Page 1 was scheduled above, so continue with pages 2..pages.
        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
......
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
CampechehoyItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
CampechehoyPipeline = JsonWriterPipeline
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'campecheHoy.pipelines.CampechehoyPipeline': 300,
#}
ITEM_PIPELINES = {
'campecheHoy.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from campecheHoy.items import NoticiasItem
"""
MEDIO:
Campeche Hoy, Campeche
USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2018 -a month=1 -a day=17
scrapy crawl noticias --nolog -s filename=2018-01-17.json -a year=2018 -a month=1 -a day=17
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
DesdeelbalconItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
DesdeelbalconPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'desdeElBalcon.pipelines.DesdeelbalconPipeline': 300,
#}
ITEM_PIPELINES = {
'desdeElBalcon.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
MEDIO:
Desde el Balcon, Yucatan
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -22,16 +26,6 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -46,21 +40,16 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
    """Schedule the first listing page and every further page of the day.

    Yields one request per listing page, each handled by ``parse_page``.
    """
    # The response already is the first listing page; dont_filter lets the
    # same URL be requested again so parse_page can process it.
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
    if len(pagination) > 0:
        # The page count is read from the trailing number of the last link.
        pagination = pagination[-1].strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])

        # Page 1 was scheduled above, so continue with pages 2..pages.
        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
......
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
DiarioyaquiItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
DiarioyaquiPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'diarioYaqui.spiders'
#USER_AGENT = 'diarioYaqui (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'diarioYaqui.pipelines.DiarioyaquiPipeline': 300,
#}
ITEM_PIPELINES = {
'diarioYaqui.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
Diario del Yaqui, Sonora
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -21,16 +27,6 @@ class UTC(tzinfo):
return 'UTC-7'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -46,21 +42,15 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
    """Schedule the first listing page and every further page of the day.

    Yields one request per listing page, each handled by ``parse_page``.
    """
    # The response already is the first listing page; dont_filter lets the
    # same URL be requested again so parse_page can process it.
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
    if len(pagination) > 0:
        # NOTE(review): the second-to-last link is presumably the one that
        # carries the highest page number - confirm against the site markup.
        pagination = pagination[-2].strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])

        # Page 1 was scheduled above, so continue with pages 2..pages.
        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
def parse_page(self, response):
......
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
GrilloportenoItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
GrilloportenoPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'grilloPorteno.spiders'
#USER_AGENT = 'grilloPorteno (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'grilloPorteno.pipelines.GrilloportenoPipeline': 300,
#}
ITEM_PIPELINES = {
'grilloPorteno.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from grilloPorteno.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
El Grillo, Yucatan
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches one markup tag: "<", then anything up to the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every markup tag matched by ``TAG_RE`` removed."""
    return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
    """Collect the articles grilloporteno.com published on one day.

    The day is taken from the spider arguments ``year``, ``month`` and
    ``day`` (``-a year=... -a month=... -a day=...``). Items are built with
    the ``NoticiasItem`` class imported at the top of this module.
    """

    name = "noticias"

    def start_requests(self):
        """Request the archive page of the requested day."""
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        self.baseURL = 'http://grilloporteno.com/' + year + '/' + month + '/' + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first listing page and every further page of the day."""
        # The response already is the first listing page; dont_filter lets
        # the same URL be requested again so parse_page can process it.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.pagination').css('a::attr(href)').extract()
        if len(pagination) > 0:
            # The page count is read from the trailing number of the last link.
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])

            # Page 1 was scheduled above, so continue with pages 2..pages.
            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        """Request every article linked from one listing page."""
        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Build one ``NoticiasItem`` from an article page and yield it."""
        item = NoticiasItem()

        item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()

        d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
        # extract_first() returns None when the page has no <time datetime>;
        # leave the date unset in that case instead of failing on the slice.
        if d is not None:
            # '-06:00' corresponds to UTC-6, the time zone of Yucatan
            # (central Mexico); force that offset onto the timestamp.
            if d[-6:] != '-06:00':
                d = d[:-6] + '-06:00'
            item['date'] = d

        item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()

        # One line of plain text per <p> of the article body.
        paragraphs = response.css('div.entry-content').css('p').extract()
        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)

        item['url'] = response.url

        yield item
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
HeraldoagsItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
HeraldoagsPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'heraldoAgs.spiders'
#USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'heraldoAgs.pipelines.HeraldoagsPipeline': 300,
#}
ITEM_PIPELINES = {
'heraldoAgs.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from heraldoAgs.items import NoticiasItem
"""
MEDIO:
El Heraldo, Aguascalientes
USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=5
scrapy crawl noticias --nolog -s filename=2018-01-05.json -a year=2018 -a month=1 -a day=5
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -15,16 +16,6 @@ LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
LajornadaItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename``
    (given on the command line as ``-s filename=<path>``).
    """

    # Order in which the item fields are written into each JSON object.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        # Nothing to finish if the file was never opened.
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the JSON array and return it unchanged.

        Fields that are not set on the item are left out of the written
        object; the remaining ones keep the order given by ``FIELD_ORDER``.
        """
        row = OrderedDict()
        for field in self.FIELD_ORDER:
            try:
                row[field] = item[field]
            except KeyError:
                # The spider did not populate this field.
                continue

        self.counter += 1
        # The first object follows "[" directly; later ones need a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(row))
        return item


# Legacy project-template class name, kept as an alias so references to it
# still resolve.
LajornadaPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornada.spiders'
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornada.pipelines.LajornadaPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
MEDIO:
La Jornada, CDMX
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -117,16 +121,6 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Previous (project-template) class name, kept as an alias so code that
# still imports it keeps working.
LajornadaagsItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename`` (given on
    the command line with ``-s filename=...``).  The file is opened when the
    spider starts, receives one JSON object per item, and is closed with the
    terminating ``]`` when the spider finishes.
    """

    # Keys written for each item, in output order; unset fields are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = OrderedDict(
            (key, item[key]) for key in self.FIELD_ORDER if key in item
        )
        # Every item after the first is preceded by a separator so the file
        # stays a valid JSON array.
        separator = ",\n" if self.counter else ""
        self.counter += 1
        self.file.write(separator + json.dumps(row))
        return item


# Backward-compatible alias: the pipeline used to be exposed under this name.
LajornadaagsPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaAgs.spiders'
#USER_AGENT = 'laJornadaAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaAgs.pipelines.LajornadaagsPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaAgs.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaAgs.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
La Jornada Aguascalientes, Ags.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches any markup tag, i.e. everything between "<" and the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Crawl the La Jornada Aguascalientes archive of a single day.

    The day is selected with the spider arguments ``year``, ``month`` and
    ``day`` (for example ``-a year=2017 -a month=3 -a day=22``).
    """
    name = "noticias"

    def start_requests(self):
        """Request the archive page of the requested date."""
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://www.lja.mx/' + year + '/' + month.zfill(2) + '/' + day.zfill(2)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first archive page and every further paginated page."""
        # The first page was already downloaded, so it must bypass the
        # duplicate filter to be parsed again as an article listing.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.pagination').css('a::attr(href)').extract()
        if pagination:
            # The last pagination link ends with the number of the last page.
            last_link = pagination[-1].strip('/')
            pages = int(last_link[last_link.rfind('/') + 1:])
            for page in range(2, pages + 1):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Follow every article link found on one listing page."""
        for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Build a ``NoticiasItem`` from an article page."""
        item = NoticiasItem()

        # The published date already includes its time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = response.css('h1.story-title::text').extract_first()
        item['topic'] = response.css('h3.story-cat::text').extract_first()
        item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()

        paragraphs = response.xpath('//div[@id="content-area"]/p').extract()
        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
        item['url'] = response.url
        yield item
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields stored for one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Backward-compatible alias for the item name generated by the project template.
LajornadagroItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename`` (given on
    the command line with ``-s filename=...``).  The file is opened when the
    spider starts, receives one JSON object per item, and is closed with the
    terminating ``]`` when the spider finishes.
    """

    # Keys written for each item, in output order; unset fields are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = OrderedDict(
            (key, item[key]) for key in self.FIELD_ORDER if key in item
        )
        # Every item after the first is preceded by a separator so the file
        # stays a valid JSON array.
        separator = ",\n" if self.counter else ""
        self.counter += 1
        self.file.write(separator + json.dumps(row))
        return item


# Backward-compatible alias: the pipeline used to be exposed under this name.
LajornadagroPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGro.spiders'
#USER_AGENT = 'laJornadaGro (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaGro.pipelines.LajornadagroPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaGro.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
MEDIO:
La Jornada Guerrero, Gro.
Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
con url: 'http://www.lajornadaguerrero.com.mx'
Esta version tiene noticias a partir del 2017.08.15
Esta version descarga noticias a partir del 2017.08.15
Uso:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=18
scrapy crawl noticias --nolog -s filename=2017-09-18.json -a year=2017 -a month=9 -a day=18
"""
# Matches any markup tag, i.e. everything between "<" and the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp article dates (UTC-6)."""

    # Time zone for Guerrero (central Mexico): six hours behind UTC.
    _OFFSET = timedelta(hours=-6)

    def utcoffset(self, dt):
        """Return the fixed offset from UTC."""
        return self._OFFSET

    def tzname(self, dt):
        """Return the name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero adjustment: this fixed-offset zone has no DST."""
        return timedelta(0)
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Crawl the La Jornada Guerrero site for the articles of a single day.

    The day is selected with the spider arguments ``year``, ``month`` and
    ``day`` (for example ``-a year=2017 -a month=9 -a day=18``).
    """
    name = "noticias"

    def start_requests(self):
        """Request the date listing of the requested day."""
        self.tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        # Spanish month abbreviations as shown on the article pages.
        self.parse_month = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06',
                            'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'}

        # self.baseURL = 'http://www.ljg.com.mx'
        self.baseURL = "http://www.lajornadaguerrero.com.mx"
        url = "/index.php?option=com_k2&view=itemlist&task=date&year="+year+"&month="+month+"&day="+day+"&Itemid=588"
        yield scrapy.Request(url=self.baseURL+url, callback=self.parse)

    def parse(self, response):
        """Schedule the first listing page and the paginated ones."""
        # The first page was already downloaded, so it must bypass the
        # duplicate filter to be parsed again as an article listing.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        # The last two pagination links are dropped, as in the original code
        # (presumably the "next"/"last" shortcuts; confirm against the site).
        pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()[:-2]
        for href in pagination:
            yield scrapy.Request(url=self.baseURL+href, callback=self.parse_page)

    def parse_page(self, response):
        """Follow every article link found on one listing page."""
        for li in response.xpath('//*[@class="genericItemView"]/div[@class="genericItemHeader"]/h2/a/@href').extract():
            yield scrapy.Request(url=self.baseURL+li, callback=self.parse_item)

    def parse_item(self, response):
        """Build a ``NoticiasItem`` from an article page."""
        item = NoticiasItem()
        path_list = ['//*[@class="itemIntroText"]/p', '//*[@class="itemFullText"]/p']

        d = response.xpath('//span[@class="itemDateCreated"]/text()').extract_first()
        if d is not None:
            d = d.replace('\n', '').replace('\t', '').replace(',', '')
            # The text starts with the month abbreviation; swap it for its number.
            m = d[:d.find(' ')]
            d = d.replace(m, self.parse_month[m])
            # A real list (not a lazy ``map`` object) so the parts can be
            # indexed on Python 3 as well as Python 2.
            parts = [int(part) for part in d.split(' ')]
            item['date'] = datetime(parts[2], parts[0], parts[1], tzinfo=self.tz).isoformat('T')

        title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
        if title is not None:
            title = title.replace('\n', '').replace('\t', '').strip()
        item['title'] = title

        item['topic'] = response.xpath('//*[@class="itemCategory"]/a/text()').extract_first()

        item['text'] = ''.join(
            remove_tags(p)
            for path in path_list
            for p in response.xpath(path).extract()
        )
        item['url'] = response.url
        yield item
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields stored for one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Backward-compatible alias for the item name generated by the project template.
LajornadagroantiguoItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename`` (given on
    the command line with ``-s filename=...``).  The file is opened when the
    spider starts, receives one JSON object per item, and is closed with the
    terminating ``]`` when the spider finishes.
    """

    # Keys written for each item, in output order; unset fields are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = OrderedDict(
            (key, item[key]) for key in self.FIELD_ORDER if key in item
        )
        # Every item after the first is preceded by a separator so the file
        # stays a valid JSON array.
        separator = ",\n" if self.counter else ""
        self.counter += 1
        self.file.write(separator + json.dumps(row))
        return item


# Backward-compatible alias: the pipeline used to be exposed under this name.
LajornadagroantiguoPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGroAntiguo.spiders'
#USER_AGENT = 'laJornadaGroAntiguo (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaGroAntiguo.pipelines.LajornadagroantiguoPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaGroAntiguo.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
"""
Esta version se encarga de la descarga de la pagina de La Jornada Guerrero
con url: 'http://www.lajornadaguerrero.com.mx/'
--> LA ESTRUCTURA DE LA PAGINA HA CAMBIADO. VER CRAWLER laJornadaGro.
Uso:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Pattern for a markup tag: "<", one or more non-">" characters, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip every ``<...>`` tag from ``text`` and return the result."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Crawl the former La Jornada Guerrero site for one day's articles.

    The day is selected with the spider arguments ``year``, ``month`` and
    ``day`` (for example ``-a year=2017 -a month=3 -a day=22``).
    """
    name = "noticias"

    def start_requests(self):
        """Request the front page of the requested date."""
        # Article links already scheduled from the front page.
        self.link_list = []
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.baseURL = 'http://www.lajornadaguerrero.com.mx/'+self.year+'/'+self.month.zfill(2)+'/'+self.day.zfill(2)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the articles linked from the front page and the section pages."""
        paths = ['//div[@class="viewlet"]/h1/a/@href', '//div[@class="viewlet"]/h2/a/@href',
                 '//div[@class="viewlet"]/h3/a/@href', '//div[@class="viewlet image"]/h1/a/@href',
                 '//div[@class="viewlet image"]/h2/a/@href', '//div[@class="viewlet image"]/h3/a/@href',
                 '//div[@class="text_block_200"]/p/a/@href']

        for path in paths:
            for link in response.xpath(path).extract():
                # The same article can match several paths; schedule it once.
                if link not in self.link_list:
                    self.link_list.append(link)
                    yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)

        for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract():
            yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)

    def parse_page(self, response):
        """Schedule the articles of a section page not already seen on the front page."""
        paths = ['//*[@id="article_list"]/h2/a/@href',
                 '//*[@id="article_list"]/h3/a/@href']

        for path in paths:
            for link in response.xpath(path).extract():
                if link not in self.link_list:
                    yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)

    def parse_item(self, response):
        """Build a ``NoticiasItem`` from an article page."""
        item = NoticiasItem()

        # The date comes from the spider arguments, not from the page.
        item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
        item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
        # NOTE(review): this stores a list (``extract()``), not a single string.
        item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract()
        item['text'] = ''.join(response.xpath('//*[@class="documentContent"]/p/text()').extract())
        item['url'] = response.url
        yield item
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields stored for one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Backward-compatible alias for the item name generated by the project template.
LajornadaoteItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename`` (given on
    the command line with ``-s filename=...``).  The file is opened when the
    spider starts, receives one JSON object per item, and is closed with the
    terminating ``]`` when the spider finishes.
    """

    # Keys written for each item, in output order; unset fields are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = OrderedDict(
            (key, item[key]) for key in self.FIELD_ORDER if key in item
        )
        # Every item after the first is preceded by a separator so the file
        # stays a valid JSON array.
        separator = ",\n" if self.counter else ""
        self.counter += 1
        self.file.write(separator + json.dumps(row))
        return item


# Backward-compatible alias: the pipeline used to be exposed under this name.
LajornadaotePipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaOte.spiders'
#USER_AGENT = 'laJornadaOte (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaOte.pipelines.LajornadaotePipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaOte.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaOte.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
La Jornada de Oriente, Puebla
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches any markup tag, i.e. everything between "<" and the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    """Crawl the La Jornada de Oriente archive of a single day.

    The day is selected with the spider arguments ``year``, ``month`` and
    ``day`` (for example ``-a year=2017 -a month=3 -a day=22``).
    """
    name = "noticias"

    def start_requests(self):
        """Request the archive page of the requested date."""
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link listed on the archive page."""
        for link in response.xpath('//div[@class="mas_noticias"]/ul[@class="sollet"]/li/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Build a ``NoticiasItem`` from an article page."""
        item = NoticiasItem()

        item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
        # The published date already includes its time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()

        bodies = response.xpath('//span[@itemprop="articleBody"]').extract()
        item['text'] = ''.join(remove_tags(body) + '\n' for body in bodies)
        item['url'] = response.url
        yield item
......@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields stored for one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


# Backward-compatible alias for the item name generated by the project template.
LajornadasanluisItem = NoticiasItem
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write every scraped item into a single JSON array file.

    The output path is read from the Scrapy setting ``filename`` (given on
    the command line with ``-s filename=...``).  The file is opened when the
    spider starts, receives one JSON object per item, and is closed with the
    terminating ``]`` when the spider finishes.
    """

    # Keys written for each item, in output order; unset fields are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = OrderedDict(
            (key, item[key]) for key in self.FIELD_ORDER if key in item
        )
        # Every item after the first is preceded by a separator so the file
        # stays a valid JSON array.
        separator = ",\n" if self.counter else ""
        self.counter += 1
        self.file.write(separator + json.dumps(row))
        return item


# Backward-compatible alias: the pipeline used to be exposed under this name.
LajornadasanluisPipeline = JsonWriterPipeline
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders'
#USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaSanLuis.pipelines.LajornadasanluisPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaSanLuis.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
La Jornada de San Luis, San Luis Potosi
Uso:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches any markup tag, i.e. everything between "<" and the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//div[@class="pages"]/a/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//*[@class="post-title"]/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
## la fecha de la noticia ya incluye la zona horaria
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
if d is None:
d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
item['date'] = d
item['title'] = response.css('h1.entry-title::text').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule every listing page of the requested day.

    The landing page itself is handed to parse_page() first; ``dont_filter``
    bypasses Scrapy's duplicate-request filter for that repeated URL. One
    further request is then issued per additional page.
    """
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    page_links = response.xpath('//div[@class="pages"]/a/@href').extract()
    if not page_links:
        return

    # The last pagination link is assumed to end in the final page number.
    last_link = page_links[-1].strip('/')
    total_pages = int(last_link.rsplit('/', 1)[-1])
    for page_number in range(2, total_pages + 1):
        yield scrapy.Request(url=self.baseURL + '/page/' + str(page_number), callback=self.parse_page)
def parse_page(self, response):
    """Request every article linked from a listing page."""
    article_links = response.xpath('//*[@class="post-title"]/h2/a/@href').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
    """Build a NoticiasItem from an article page and yield it."""
    # The published date already carries its time-zone offset.
    published = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    if published is None:
        published = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()

    # Only justified paragraphs are taken as article body.
    paragraphs = response.xpath('//p[@style="text-align: justify;"]/text()').extract()
    body = ''.join(remove_tags(fragment) + '\n' for fragment in paragraphs)

    item = NoticiasItem()
    item['date'] = published
    item['title'] = response.css('h1.entry-title::text').extract_first()
    item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
    item['text'] = body
    item['url'] = response.url
    yield item
......@@ -8,7 +8,13 @@
import scrapy
class LajornadaverItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that owns the JSON output file of one crawl.

    The file is opened and an opening ``[`` written when the spider starts;
    the closing ``]`` is written when the spider finishes.
    """

    def __init__(self, filename):
        # Path of the output file; it is only opened in open_spider().
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Reset the item counter and start the JSON array."""
        self.counter = 0
        handle = open(self.filename, 'w')
        self.file = handle
        handle.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        handle = self.file
        handle.write("]")
        handle.close()
class LajornadaverPipeline(object):
def process_item(self, item, spider):
    """Append ``item`` to the open JSON array and return it unchanged.

    Only the fields actually set on the item are written, always in the
    order date, topic, title, author, location, text, url. Every item after
    the first is preceded by a comma and a newline so the output stays a
    valid JSON array.

    Relies on ``self.counter`` and ``self.file`` as set up by open_spider().
    """
    field_order = ('date', 'topic', 'title', 'author', 'location', 'text', 'url')
    row = []
    for field in field_order:
        try:
            row.append((field, item[field]))
        except KeyError:
            # The field was never populated on this item: leave it out.
            # Any other error is a real problem and is allowed to surface.
            continue
    line = OrderedDict(row)

    self.counter += 1
    separator = "" if self.counter == 1 else ",\n"
    self.file.write(separator + json.dumps(line))
    return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaVer.spiders'
#USER_AGENT = 'laJornadaVer (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaVer.pipelines.LajornadaverPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaVer.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
La Jornada de Veracruz, Ver.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches any single markup tag such as <p> or </div>.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) used to stamp crawled dates."""

    def utcoffset(self, dt):
        # Time zone for Veracruz (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # Fixed offset, so no daylight-saving adjustment. Without this
        # override the base tzinfo.dst() raises NotImplementedError, which
        # breaks datetime.dst() and datetime.timetuple().
        return timedelta(0)
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para veracruz (centro de mexico): utc-6
return timedelta(hours=-6)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
yield scrapy.Request(url=self.builtURL, callback=self.parse)
def parse(self, response):
paths = ['//*[@class="article-header"]/h2/a/@href',
'//ul[@class="article-array content-category"]/li/a/@href']
for path in paths:
links = response.xpath(path).extract()
if ( len(links) > 0 ):
for link in links:
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
# for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract():
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
def parse_page(self, response):
paths = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href']
for path in paths:
for link in response.xpath(path).extract():
if not ( link in self.link_list ):
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
title = title.replace('\r','')
title = title.replace('\n','')
title = title.lstrip(' ')
title = title.rstrip(' ')
item['title'] = title
topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
topic = topic.replace('\r','')
topic = topic.replace('\n','')
topic = topic.lstrip(' ')
topic = topic.rstrip(' ')
item['topic'] = topic
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
if ( len(paragraph) > 0 ):
for p in paragraph:
text += paragraph[3]
else:
for p in response.xpath('//*[@class="shortcode-content"]').extract():
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
name = "noticias"
def start_requests(self):
    """Request the archive listing for the day given by the spider arguments.

    Reads the ``year``, ``month`` and ``day`` attributes of the spider and
    stores the ISO-formatted date, the site root and the archive URL on it.
    """
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)

    # Every item of this crawl is stamped with the requested day.
    requested_day = datetime(int(year), int(month), int(day), tzinfo=UTC())
    self.date = requested_day.isoformat('T')

    self.baseURL = 'http://www.jornadaveracruz.com.mx/'
    # The query value is built as zero-padded dd/mm/yyyy.
    archive_date = day.zfill(2) + '/' + month.zfill(2) + '/' + year
    self.builtURL = self.baseURL + 'Archive.aspx?date=' + archive_date
    yield scrapy.Request(url=self.builtURL, callback=self.parse)
def parse(self, response):
    """Request every article linked from the archive page."""
    link_queries = (
        '//*[@class="article-header"]/h2/a/@href',
        '//ul[@class="article-array content-category"]/li/a/@href',
    )
    for query in link_queries:
        for href in response.xpath(query).extract():
            # Each href is appended to the site root as-is.
            yield scrapy.Request(url=self.baseURL + href, callback=self.parse_item)
    # NOTE: the former "text_block_200" section crawl, which handed section
    # pages to parse_page(), is disabled.
def parse_page(self, response):
    """Request the articles listed on a section page, skipping known links.

    NOTE(review): no visible code schedules this callback; confirm whether it
    is still needed.
    """
    # link_list is not assigned by any visible code in this spider; fall back
    # to an empty collection so the membership test cannot raise AttributeError.
    known_links = getattr(self, 'link_list', ())
    paths = ['//*[@id="article_list"]/h2/a/@href',
             '//*[@id="article_list"]/h3/a/@href']
    for path in paths:
        for link in response.xpath(path).extract():
            if link not in known_links:
                yield scrapy.Request(url=self.baseURL + '/' + link, callback=self.parse_item)
def parse_item(self, response):
    """Build a NoticiasItem from an article page and yield it.

    The date is the crawl date stored by start_requests(). Title and topic
    are stripped of line breaks and surrounding spaces, and are left unset
    when the page does not provide them. The text is taken from the
    paragraphs of the ``shortcode-content`` container, or from the whole
    container when it holds no paragraph text.
    """
    def clean(value):
        # Drop line breaks and the spaces padding the heading text.
        return value.replace('\r', '').replace('\n', '').strip(' ')

    item = NoticiasItem()
    item['date'] = self.date

    title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
    if title is not None:
        item['title'] = clean(title)

    topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
    if topic is not None:
        item['topic'] = clean(topic)

    paragraphs = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
    if paragraphs:
        # Append each paragraph in turn. The previous code appended
        # paragraph[3] on every iteration, which repeated one fragment or
        # raised IndexError on articles with fewer than four paragraphs.
        text = ''.join(p + '\n' for p in paragraphs)
    else:
        containers = response.xpath('//*[@class="shortcode-content"]').extract()
        text = ''.join(remove_tags(block) + '\n' for block in containers)

    item['text'] = text
    item['url'] = response.url
    yield item
......@@ -8,7 +8,13 @@
import scrapy
class LajornadazacItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that owns the JSON output file of one crawl.

    The file is opened and an opening ``[`` written when the spider starts;
    the closing ``]`` is written when the spider finishes.
    """

    def __init__(self, filename):
        # Path of the output file; it is only opened in open_spider().
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Reset the item counter and start the JSON array."""
        self.counter = 0
        handle = open(self.filename, 'w')
        self.file = handle
        handle.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        handle = self.file
        handle.write("]")
        handle.close()
class LajornadazacPipeline(object):
def process_item(self, item, spider):
    """Append ``item`` to the open JSON array and return it unchanged.

    Only the fields actually set on the item are written, always in the
    order date, topic, title, author, location, text, url. Every item after
    the first is preceded by a comma and a newline so the output stays a
    valid JSON array.

    Relies on ``self.counter`` and ``self.file`` as set up by open_spider().
    """
    field_order = ('date', 'topic', 'title', 'author', 'location', 'text', 'url')
    row = []
    for field in field_order:
        try:
            row.append((field, item[field]))
        except KeyError:
            # The field was never populated on this item: leave it out.
            # Any other error is a real problem and is allowed to surface.
            continue
    line = OrderedDict(row)

    self.counter += 1
    separator = "" if self.counter == 1 else ",\n"
    self.file.write(separator + json.dumps(line))
    return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaZac.spiders'
#USER_AGENT = 'laJornadaZac (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaZac.pipelines.LajornadazacPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaZac.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaZac.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
La Jornada Zacatecas, Zac.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches any single markup tag such as <p> or </div>.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = "http://ljz.mx/" + year + "/" + month + "/" + day
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//h2[@class="cat-list-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
content = response.xpath('//*[@class="entry-content clearfix"]/p').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div/p').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div/div/p').extract()
for paragraph in content:
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule every listing page of the requested day.

    The landing page itself is handed to parse_page() first; ``dont_filter``
    bypasses Scrapy's duplicate-request filter for that repeated URL. One
    further request is then issued per additional page.
    """
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    page_links = response.xpath('//div[@class="pagination"]/a/@href').extract()
    if not page_links:
        return

    # The last pagination link is assumed to end in the final page number.
    last_link = page_links[-1].strip('/')
    total_pages = int(last_link.rsplit('/', 1)[-1])
    for page_number in range(2, total_pages + 1):
        yield scrapy.Request(url=self.baseURL + '/page/' + str(page_number), callback=self.parse_page)
def parse_page(self, response):
    """Request every article linked from a listing page."""
    article_links = response.xpath('//h2[@class="cat-list-title"]/a/@href').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
    """Build a NoticiasItem from an article page and yield it.

    The ``date`` field is left unset when the page exposes no publication
    time, instead of failing while slicing a missing value.
    """
    item = NoticiasItem()

    published = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
    if published is not None:
        # '-06:00' is UTC-6, the time zone of Zacatecas (central Mexico).
        # Only the trailing six characters are rewritten; the clock time
        # itself is not converted.
        if published[-6:] != '-06:00':
            published = published[:-6] + '-06:00'
        item['date'] = published

    item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
    item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()

    # Article markup varies; try each known layout until one yields content.
    content_queries = (
        '//*[@class="entry-content clearfix"]/p',
        '//*[@class="entry-content clearfix"]/div/p',
        '//*[@class="entry-content clearfix"]/div',
        '//*[@class="entry-content clearfix"]/div/div/p',
    )
    content = []
    for query in content_queries:
        content = response.xpath(query).extract()
        if content:
            break

    item['text'] = ''.join(remove_tags(fragment) + '\n' for fragment in content)
    item['url'] = response.url
    yield item
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that owns the JSON output file of one crawl.

    The file is opened and an opening ``[`` written when the spider starts;
    the closing ``]`` is written when the spider finishes.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def __init__(self, filename):
        # Path of the output file; it is only opened in open_spider().
        self.filename = filename

    def open_spider(self, spider):
        """Reset the item counter and start the JSON array."""
        self.counter = 0
        handle = open(self.filename, 'w')
        self.file = handle
        handle.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        handle = self.file
        handle.write("]")
        handle.close()
class MipuntodevistaPipeline(object):
def process_item(self, item, spider):
    """Append ``item`` to the open JSON array and return it unchanged.

    Only the fields actually set on the item are written, always in the
    order date, topic, title, author, location, text, url. Every item after
    the first is preceded by a comma and a newline so the output stays a
    valid JSON array.

    Relies on ``self.counter`` and ``self.file`` as set up by open_spider().
    """
    field_order = ('date', 'topic', 'title', 'author', 'location', 'text', 'url')
    row = []
    for field in field_order:
        try:
            row.append((field, item[field]))
        except KeyError:
            # The field was never populated on this item: leave it out.
            # Any other error is a real problem and is allowed to surface.
            continue
    line = OrderedDict(row)

    self.counter += 1
    separator = "" if self.counter == 1 else ",\n"
    self.file.write(separator + json.dumps(line))
    return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'miPuntoDeVista.spiders'
#USER_AGENT = 'miPuntoDeVista (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'miPuntoDeVista.pipelines.MipuntodevistaPipeline': 300,
#}
ITEM_PIPELINES = {
'miPuntoDeVista.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
USO:
scrapy crawl noticias -t json --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches any single markup tag such as <p> or </div>.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.mipuntodevista.com.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()
d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
name = "noticias"
def start_requests(self):
    """Request the listing of the day given by the spider arguments.

    Reads the ``year``, ``month`` and ``day`` attributes of the spider and
    stores the resulting day URL on it as ``baseURL``.
    """
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)

    site_root = 'http://www.mipuntodevista.com.mx/'
    # Day archives live under <root>/<year>/<month>/<day>.
    self.baseURL = site_root + year + '/' + month + '/' + day
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Schedule every listing page of the requested day.

    The landing page itself is handed to parse_page() first; ``dont_filter``
    bypasses Scrapy's duplicate-request filter for that repeated URL. Pages
    2..N are then requested at ``<baseURL>/page/<n>``.
    """
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    pagination = response.css('div.page-nav').css('a::attr(href)').extract()
    if len(pagination) > 0:
        # Assumes the second-to-last link points at the final page (the last
        # one presumably being a "next" control) -- TODO confirm.
        pagination = pagination[-2].strip('/')
        pages = int(pagination[pagination.rfind('/') + 1:])
        for page in range(1, pages):
            # Request the numbered page. The previous code re-requested
            # response.url here, so pages after the first were never crawled.
            yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)
def parse_page(self, response):
    """Request every article linked from a listing page."""
    article_links = response.css('h3.entry-title').css('a::attr(href)').extract()
    for href in article_links:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
    """Build a NoticiasItem from an article page and yield it.

    ``date`` and ``author`` are left unset when the page does not provide
    them. ``topic`` is the second category label when there is one,
    otherwise the first (or None when there are no categories).
    """
    item = NoticiasItem()
    header = response.css('div.td-post-header')
    item['title'] = header.css('h1.entry-title::text').extract_first()

    published = header.css('span.td-post-date').css('time::attr(datetime)').extract_first()
    if published is not None:
        # '-06:00' is UTC-6, the time zone of Yucatan (central Mexico).
        # Only the trailing six characters are rewritten; the clock time
        # itself is not converted.
        if published[-6:] != '-06:00':
            published = published[:-6] + '-06:00'
        item['date'] = published

    author = response.xpath('//*[@class="td-post-author-name"]/a/text()').extract_first()
    if author is not None:
        item['author'] = author

    # Explicit length check instead of a bare except around extract()[1].
    topics = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
    if len(topics) > 1:
        item['topic'] = topics[1]
    else:
        item['topic'] = topics[0] if topics else None

    paragraphs = response.css('div.td-post-content').css('p').extract()
    text = ''.join(remove_tags(paragraph) + "\n" for paragraph in paragraphs)
    item['text'] = text.strip()
    item['url'] = response.url
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment