Commit 5953d008 authored by Renán Sosa Guillen

crawlers

parent ba65cc37
@@ -8,7 +8,13 @@
import scrapy
-class AlchileItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
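Note: every project in this commit gets this same NoticiasItem definition. A quick, hypothetical illustration of how the shared item behaves (field names act like dict keys and unset fields stay absent, which is exactly what the new pipeline relies on; the sample value is made up):

import scrapy

class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

item = NoticiasItem()
item['title'] = 'Titular de prueba'   # sample value, not from a real crawl
print('author' in item)               # -> False: unset fields are simply missing
print(dict(item))                     # -> {'title': 'Titular de prueba'}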
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class AlchilePipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
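The JsonWriterPipeline above builds a JSON array by hand: "[" on open_spider, a comma-prefixed json.dumps per item, "]" on close_spider, with the output path taken from the -s filename=... crawl option via crawler.settings. A minimal, Scrapy-free sketch of that write pattern (the file name and sample items are illustrative only):

import json
from collections import OrderedDict

FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

class JsonArrayWriter(object):
    def __init__(self, filename):
        self.filename = filename

    def open(self):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")          # opening bracket, as in open_spider

    def write_item(self, item):
        # keep only the fields that are present, in a fixed order
        line = OrderedDict((f, item[f]) for f in FIELDS if f in item)
        self.counter += 1
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))

    def close(self):
        self.file.write("]")          # closing bracket, as in close_spider
        self.file.close()

writer = JsonArrayWriter("demo.json")
writer.open()
writer.write_item({"title": "Nota 1", "url": "http://example.com/1"})
writer.write_item({"title": "Nota 2", "url": "http://example.com/2"})
writer.close()                        # demo.json now holds a valid JSON array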
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'alChile.spiders'
#USER_AGENT = 'alChile (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'alChile.pipelines.AlchilePipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'alChile.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from alChile.items import NoticiasItem
"""
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+MEDIO:
+Al Chile, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -9,16 +9,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,26 +20,22 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://alchile.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
-        pagination = response.css('div.page-nav').css('a.last::attr(href)').extract()
-        if ( len(pagination) > 0 ):
-            pagination = pagination[0].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
+        if len(pagination) > 0:
+            pagination = pagination[-2].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+
+            for page in range(1,pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
...
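The revised parse() always schedules the landing page and then, when a pagination block exists, derives the page count from one of its hrefs. A sketch of that count extraction with made-up hrefs (the assumption that the second-to-last link ends in the last page number mirrors the pagination[-2] line above):

hrefs = [                                        # hypothetical extract() output
    "http://alchile.com.mx/2017/03/22/page/2/",
    "http://alchile.com.mx/2017/03/22/page/9/",  # assumed "last page" link
    "http://alchile.com.mx/2017/03/22/page/2/",  # assumed "next" link
]

last = hrefs[-2].strip('/')
pages = int(last[last.rfind('/') + 1:])
print(pages)   # -> 9

# pages 2..pages are then requested as baseURL + "/page/" + str(page + 1)
print(["/page/%d" % (page + 1) for page in range(1, pages)])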
@@ -8,7 +8,13 @@
import scrapy
-class CampechehoyItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class CampechehoyPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'campecheHoy.pipelines.CampechehoyPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'campecheHoy.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from campecheHoy.items import NoticiasItem
"""
MEDIO:
Campeche Hoy, Campeche
USO:
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2018 -a month=1 -a day=17
+scrapy crawl noticias --nolog -s filename=2018-01-17.json -a year=2018 -a month=1 -a day=17
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
+TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
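TRASH_RE only appears here as a definition, so its exact use is not visible in this hunk; a hedged reading of the two regexes on a made-up fragment: TAG_RE strips individual tags but keeps their inner text, while TRASH_RE drops a whole "<tag>...</tag>" span (plus a trailing space), e.g. an embedded script or widget.

import re

TAG_RE = re.compile(r'<[^>]+>')
TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')

fragment = '<script>var x = 1;</script> Texto de la nota.'
print(TAG_RE.sub('', fragment))    # -> 'var x = 1; Texto de la nota.'
print(TRASH_RE.sub('', fragment))  # -> 'Texto de la nota.'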
@@ -8,7 +8,13 @@
import scrapy
-class DesdeelbalconItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class DesdeelbalconPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'desdeElBalcon.pipelines.DesdeelbalconPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'desdeElBalcon.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
from datetime import datetime, timedelta, tzinfo
+from desdeElBalcon.items import NoticiasItem
"""
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+MEDIO:
+Desde el Balcon, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -22,16 +26,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -46,21 +40,16 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
-        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if page == 0:
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
...
@@ -8,7 +8,13 @@
import scrapy
-class DiarioyaquiItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class DiarioyaquiPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'diarioYaqui.spiders'
#USER_AGENT = 'diarioYaqui (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'diarioYaqui.pipelines.DiarioyaquiPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'diarioYaqui.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Diario del Yaqui, Sonora
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -21,16 +27,6 @@ class UTC(tzinfo):
        return 'UTC-7'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -46,21 +42,15 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
-        pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
+        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1,pages):
+                yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
...
@@ -8,7 +8,13 @@
import scrapy
-class GrilloportenoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class GrilloportenoPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'grilloPorteno.spiders'
#USER_AGENT = 'grilloPorteno (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'grilloPorteno.pipelines.GrilloportenoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'grilloPorteno.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from grilloPorteno.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+El Grillo, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
-    def start_requests(self):
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
-        self.baseURL='http://grilloporteno.com/'+year+'/'+month+'/'+day
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-    def parse(self, response):
-        pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
-            pagination = pagination[-1].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-    def parse_page(self, response):
-        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()
-        d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
-        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
-        if d[-6:] != '-06:00':
-            d = d[:-6] + '-06:00'
-        item['date'] = d
-        item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
-        for paragraph in response.css('div.entry-content').css('p').extract():
-            text += remove_tags(paragraph) + '\n'
-        item['text'] = text
-        item['url'] = response.url
-        #print item['title']
-        yield item
+class QuotesSpider(scrapy.Spider):
+    name = "noticias"
+    def start_requests(self):
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        day = getattr(self, 'day', None)
+        self.baseURL='http://grilloporteno.com/' + year +'/' + month + '/' + day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.css('div.pagination').css('a::attr(href)').extract()
+        if len(pagination) > 0:
+            pagination = pagination[-1].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+    def parse_page(self, response):
+        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()
+        d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
+        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
+        if d[-6:] != '-06:00':
+            d = d[:-6] + '-06:00'
+        item['date'] = d
+        item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
+        for paragraph in response.css('div.entry-content').css('p').extract():
+            text += remove_tags(paragraph) + '\n'
+        item['text'] = text
+        item['url'] = response.url
+        #print item['title']
+        yield item
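parse_item above trusts the <time datetime="..."> attribute but forces the offset to Yucatan's UTC-6. A small sketch of that normalization step (sample timestamps only):

def force_utc6(d):
    # replace whatever UTC offset the page reports with '-06:00'
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    return d

print(force_utc6('2017-03-22T10:15:00+00:00'))  # -> 2017-03-22T10:15:00-06:00
print(force_utc6('2017-03-22T10:15:00-06:00'))  # unchanged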
@@ -8,7 +8,13 @@
import scrapy
-class HeraldoagsItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class HeraldoagsPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'heraldoAgs.spiders'
#USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'heraldoAgs.pipelines.HeraldoagsPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'heraldoAgs.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from heraldoAgs.items import NoticiasItem
"""
MEDIO:
El Heraldo, Aguascalientes
USO:
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=5
+scrapy crawl noticias --nolog -s filename=2018-01-05.json -a year=2018 -a month=1 -a day=5
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -15,16 +16,6 @@ LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
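Only the LOC_RE / DAT_RE definitions are visible in this hunk, so their exact use in the spider is not shown; a hedged guess is that they strip a "Ciudad, Edo., 5-enero.- " style dateline from the article body. Applying the regexes exactly as defined to a made-up line:

import re

LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')

text = 'Aguascalientes, Ags., 5-enero.- El gobernador presento el plan.'
print(LOC_RE.sub('', text, 1))   # -> 'El gobernador presento el plan.'
print(DAT_RE.sub('', '5-enero.- El gobernador presento el plan.', 1))
# -> 'El gobernador presento el plan.'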
@@ -8,7 +8,13 @@
import scrapy
-class LajornadaItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LajornadaPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornada.spiders'
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laJornada.pipelines.LajornadaPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laJornada.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
"""
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+MEDIO:
+La Jornada, CDMX
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -117,16 +121,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
@@ -8,7 +8,13 @@
import scrapy
-class LajornadaagsItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LajornadaagsPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaAgs.spiders'
#USER_AGENT = 'laJornadaAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laJornadaAgs.pipelines.LajornadaagsPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laJornadaAgs.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from laJornadaAgs.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+La Jornada Aguascalientes, Ags.
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
-    def start_requests(self):
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
-        self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-    def parse(self, response):
-        pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
-            pagination = pagination[-1].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-    def parse_page(self, response):
-        for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        ## la fecha de la noticia ya incluye la zona horaria
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        item['title'] = response.css('h1.story-title::text').extract_first()
-        item['topic'] = response.css('h3.story-cat::text').extract_first()
-        item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()
-        for paragraph in response.xpath('//div[@id="content-area"]/p').extract():
-            text += remove_tags(paragraph) + '\n'
-        item['text'] = text
-        item['url'] = response.url
-        # print item['title']
-        yield item
+class QuotesSpider(scrapy.Spider):
+    name = "noticias"
+    def start_requests(self):
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        day = getattr(self, 'day', None)
+        self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.css('div.pagination').css('a::attr(href)').extract()
+        if len(pagination) > 0:
+            pagination = pagination[-1].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+    def parse_page(self, response):
+        for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        ## la fecha de la noticia ya incluye la zona horaria
+        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item['title'] = response.css('h1.story-title::text').extract_first()
+        item['topic'] = response.css('h3.story-cat::text').extract_first()
+        item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()
+        for paragraph in response.xpath('//div[@id="content-area"]/p').extract():
+            text += remove_tags(paragraph) + '\n'
+        item['text'] = text
+        item['url'] = response.url
+        # print item['title']
+        yield item
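parse_item here (and in the other spiders) builds the article body by stripping tags from each extracted <p> fragment and joining them with newlines. A tiny sketch of that remove_tags / accumulation pattern with made-up paragraphs:

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

paragraphs = ['<p>Primer parrafo.</p>', '<p>Segundo <strong>parrafo</strong>.</p>']
text = ''
for paragraph in paragraphs:
    text += remove_tags(paragraph) + '\n'
print(repr(text))  # -> 'Primer parrafo.\nSegundo parrafo.\n'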
@@ -8,7 +8,13 @@
import scrapy
-class LajornadagroItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LajornadagroPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGro.spiders'
#USER_AGENT = 'laJornadaGro (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laJornadaGro.pipelines.LajornadagroPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laJornadaGro.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
""" """
MEDIO:
La Jornada Guerrero, Gro.
Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
con url: 'http://www.lajornadaguerrero.com.mx' con url: 'http://www.lajornadaguerrero.com.mx'
Esta version tiene noticias a partir del 2017.08.15 Esta version descarga noticias a partir del 2017.08.15
Uso: Uso:
scrapy crawl noticias --nolog -s filename=2017-09-18.json -a year=2017 -a month=9 -a day=18
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=18
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo): class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)""" """clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para guerrero (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt): def utcoffset(self, dt):
# nombre de la zona horaria # zona horaria para guerrero (centro de mexico): utc-6
return 'UTC-6' return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC() self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.parse_month = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06', self.parse_month = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06',
'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'} 'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'}
# self.baseURL = 'http://www.ljg.com.mx' # self.baseURL = 'http://www.ljg.com.mx'
self.baseURL = 'http://www.lajornadaguerrero.com.mx' self.baseURL = "http://www.lajornadaguerrero.com.mx"
url = '/index.php?option=com_k2&view=itemlist&task=date&year='+year+'&month='+month+'&day='+day+'&Itemid=588' url = "/index.php?option=com_k2&view=itemlist&task=date&year="+year+"&month="+month+"&day="+day+"&Itemid=588"
yield scrapy.Request(url=self.baseURL+url, callback=self.parse) yield scrapy.Request(url=self.baseURL+url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()[:-2] yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
if len(pagination) > 0:
pagination.insert(0, response.url) pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()[:-2]
for page in range(0, len(pagination)): if len(pagination) > 0:
if page == 0: for page in range(0, len(pagination)):
yield scrapy.Request(url=pagination[page], callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=self.baseURL+pagination[page], callback=self.parse_page)
elif page > 0:
yield scrapy.Request(url=self.baseURL+pagination[page], callback=self.parse_page) def parse_page(self, response):
for li in response.xpath('//*[@class="genericItemView"]/div[@class="genericItemHeader"]/h2/a/@href').extract():
else: yield scrapy.Request(url=self.baseURL+li, callback=self.parse_item)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_item(self, response):
def parse_page(self, response): item = NoticiasItem()
for li in response.xpath('//*[@class="genericItemView"]/div[@class="genericItemHeader"]/h2/a/@href').extract(): path_list = ['//*[@class="itemIntroText"]/p', '//*[@class="itemFullText"]/p']
yield scrapy.Request(url=self.baseURL+li, callback=self.parse_item) text = ''
d = response.xpath('//span[@class="itemDateCreated"]/text()').extract_first()
def parse_item(self, response): if d is not None:
print response.url d = d.replace('\n','')
item = NoticiasItem() d = d.replace('\t','')
path_list = ['//*[@class="itemIntroText"]/p', '//*[@class="itemFullText"]/p'] d = d.replace(',','')
text = '' m = d[:d.find(' ')]
d = d.replace(m, self.parse_month[m])
d = response.xpath('//span[@class="itemDateCreated"]/text()').extract_first() # item['date'] = datetime.strptime(d, '%m %d %Y').date()
if d is not None: d = map(int, d.split(' '))
d = d.replace('\n','') item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T')
d = d.replace('\t','')
d = d.replace(',','') title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
m = d[:d.find(' ')] if title is not None:
d = d.replace(m, self.parse_month[m]) title = title.replace('\n','')
# item['date'] = datetime.strptime(d, '%m %d %Y').date() title = title.replace('\t','')
d = map(int, d.split(' ')) title = title.lstrip()
item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T') title = title.rstrip()
item['title'] = title
title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
if title is not None: item['topic'] = response.xpath('//*[@class="itemCategory"]/a/text()').extract_first()
title = title.replace('\n','')
title = title.replace('\t','') for path in path_list:
title = title.lstrip() for p in response.xpath(path).extract():
title = title.rstrip() text += remove_tags(p)
item['title'] = title item['text'] = text
item['topic'] = response.xpath('//*[@class="itemCategory"]/a/text()').extract_first() item['url'] = response.url
for path in path_list: # print item['url']
for p in response.xpath(path).extract(): yield item
text += remove_tags(p)
item['text'] = text
item['url'] = response.url
# print item['url']
yield item
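The date handling in parse_item above is the least obvious step: the scraped itemDateCreated string uses Spanish month abbreviations in month-day-year order, which gets mapped through parse_month and rebuilt as a timezone-aware ISO timestamp. A self-contained sketch of the same conversion; the sample string and helper name are illustrative only:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """fixed UTC-6 offset, as in the spider above"""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

PARSE_MONTH = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06',
               'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'}

def to_iso(raw):
    # e.g. raw == 'Sep 18, 2017' as scraped from the itemDateCreated span
    raw = raw.replace('\n', '').replace('\t', '').replace(',', '').strip()
    month_name = raw[:raw.find(' ')]
    raw = raw.replace(month_name, PARSE_MONTH[month_name])
    month, day, year = [int(part) for part in raw.split(' ')]
    return datetime(year, month, day, tzinfo=UTC()).isoformat('T')

# to_iso('Sep 18, 2017')  ->  '2017-09-18T00:00:00-06:00'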
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadagroantiguoItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
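The same NoticiasItem definition recurs in every one of these crawler projects: all fields are optional and a spider only fills in what the page actually provides. A minimal usage sketch with placeholder values:

from laJornadaGroAntiguo.items import NoticiasItem

item = NoticiasItem()
item['title'] = 'Titular de ejemplo'
item['date'] = '2017-03-22T00:00:00-06:00'
item['url'] = 'http://www.lajornadaguerrero.com.mx/'
# Fields that are never set (author, location, topic, text) simply stay absent,
# which is why the pipeline below wraps every field access in try/except.
print(dict(item))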
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadagroantiguoPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
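process_item above assembles an OrderedDict one field at a time and swallows the KeyError raised for fields the spider never set. The same effect can be had with a single loop over the known field names; a compact sketch only, the helper name is not part of the commit:

from collections import OrderedDict

FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

def item_to_row(item):
    # Keeps the fixed field order and silently skips unset fields,
    # exactly like the chain of try/except blocks above.
    return OrderedDict((field, item[field]) for field in FIELDS if field in item)

# json.dumps(item_to_row(item)) yields the same record the pipeline writes.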
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGroAntiguo.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGroAntiguo.spiders'
#USER_AGENT = 'laJornadaGroAntiguo (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaGroAntiguo (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True ...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
...@@ -64,9 +64,9 @@ COOKIES_ENABLED = False ...@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaGroAntiguo.pipelines.LajornadagroantiguoPipeline': 300, 'laJornadaGroAntiguo.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
""" """
This version handles downloading the La Jornada Guerrero site This version handles downloading the La Jornada Guerrero site
with url: 'http://www.lajornadaguerrero.com.mx/' with url: 'http://www.lajornadaguerrero.com.mx/'
--> THE PAGE STRUCTURE HAS CHANGED. SEE THE laJornadaGro CRAWLER.
Usage: Usage:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.link_list = [] self.link_list = []
self.year = getattr(self, 'year', None) self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None) self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None) self.day = getattr(self, 'day', None)
self.baseURL='http://www.lajornadaguerrero.com.mx/'+self.year+'/'+self.month.zfill(2)+'/'+self.day.zfill(2) self.baseURL='http://www.lajornadaguerrero.com.mx/'+self.year+'/'+self.month.zfill(2)+'/'+self.day.zfill(2)
urls = [ urls = [
self.baseURL, self.baseURL,
] ]
for url in urls: for url in urls:
yield scrapy.Request(url=url, callback=self.parse) yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
paths = ['//div[@class="viewlet"]/h1/a/@href', '//div[@class="viewlet"]/h2/a/@href', paths = ['//div[@class="viewlet"]/h1/a/@href', '//div[@class="viewlet"]/h2/a/@href',
'//div[@class="viewlet"]/h3/a/@href', '//div[@class="viewlet image"]/h1/a/@href', '//div[@class="viewlet"]/h3/a/@href', '//div[@class="viewlet image"]/h1/a/@href',
'//div[@class="viewlet image"]/h2/a/@href', '//div[@class="viewlet image"]/h3/a/@href', '//div[@class="viewlet image"]/h2/a/@href', '//div[@class="viewlet image"]/h3/a/@href',
'//div[@class="text_block_200"]/p/a/@href'] '//div[@class="text_block_200"]/p/a/@href']
for path in paths: for path in paths:
links = response.xpath(path).extract() links = response.xpath(path).extract()
if ( len(links) > 0 ): if ( len(links) > 0 ):
for link in links: for link in links:
if not ( link in self.link_list ): if not ( link in self.link_list ):
self.link_list.append(link) self.link_list.append(link)
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item) yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract(): for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract():
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page) yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
paths = ['//*[@id="article_list"]/h2/a/@href', paths = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href'] '//*[@id="article_list"]/h3/a/@href']
for path in paths: for path in paths:
for link in response.xpath(path).extract(): for link in response.xpath(path).extract():
if not ( link in self.link_list ): if not ( link in self.link_list ):
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item) yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first() item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract() item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract()
for paragraph in response.xpath('//*[@class="documentContent"]/p/text()').extract(): for paragraph in response.xpath('//*[@class="documentContent"]/p/text()').extract():
text += paragraph text += paragraph
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
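parse above walks several front-page blocks whose headline lists can repeat the same article link, so the spider keeps self.link_list as a crude seen-set before yielding each request. A stripped-down sketch of that bookkeeping with made-up hrefs (a set would be the more usual container, but the list stays tiny here):

link_list = []
scraped_hrefs = ['nota-uno.html', 'nota-dos.html', 'nota-uno.html']

for link in scraped_hrefs:
    if link not in link_list:
        link_list.append(link)
        # the spider would now: yield scrapy.Request(url=baseURL + '/' + link, ...)

# link_list == ['nota-uno.html', 'nota-dos.html']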
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadaoteItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaotePipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
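The writer above emits a JSON array by hand: '[' in open_spider, a ',\n' separator driven by the counter, and ']' in close_spider. Purely as an alternative sketch, not what this commit does, the same file could be produced by buffering the rows and dumping them once; the class name is an assumption:

import json

class BufferedJsonWriterPipeline(object):
    """Assumed alternative, not the committed pipeline."""

    def __init__(self, filename):
        self.filename = filename
        self.rows = []

    @classmethod
    def from_crawler(cls, crawler):
        # same custom "filename" setting, passed with -s filename=...
        return cls(crawler.settings.get('filename'))

    def process_item(self, item, spider):
        self.rows.append(dict(item))
        return item

    def close_spider(self, spider):
        with open(self.filename, 'w') as f:
            json.dump(self.rows, f)

The trade-off is memory: everything is held until close_spider, whereas the committed version streams each record to disk as it arrives.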
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaOte.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaOte.spiders'
#USER_AGENT = 'laJornadaOte (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaOte (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaOte.pipelines.LajornadaotePipeline': 300, 'laJornadaOte.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaOte.items import NoticiasItem
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 OUTLET:
La Jornada de Oriente, Puebla
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item): def start_requests(self):
title = scrapy.Field() year = getattr(self, 'year', None)
text = scrapy.Field() month = getattr(self, 'month', None)
date = scrapy.Field() day = getattr(self, 'day', None)
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
class QuotesSpider(scrapy.Spider): yield scrapy.Request(url=self.baseURL, callback=self.parse)
name = "noticias"
def start_requests(self): def parse(self, response):
year = getattr(self, 'year', None) for link in response.xpath('//div[@class="mas_noticias"]/ul[@class="sollet"]/li/a/@href').extract():
month = getattr(self, 'month', None) yield scrapy.Request(url=link, callback=self.parse_item)
day = getattr(self, 'day', None)
self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
def parse_item(self, response):
yield scrapy.Request(url=self.baseURL, callback=self.parse) item = NoticiasItem()
text = ''
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
def parse(self, response): ## la fecha de la noticia ya incluye la zona horaria
for link in response.xpath('//div[@class="mas_noticias"]/ul[@class="sollet"]/li/a/@href').extract(): item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
yield scrapy.Request(url=link, callback=self.parse_item) item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
def parse_item(self, response): text += remove_tags(paragraph) + '\n'
item = NoticiasItem() item['text'] = text
text = '' item['url'] = response.url
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
## la fecha de la noticia ya incluye la zona horaria # print item['title']
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() yield item
item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadasanluisItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadasanluisPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders'
#USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaSanLuis.pipelines.LajornadasanluisPipeline': 300, 'laJornadaSanLuis.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 OUTLET:
La Jornada de San Luis, San Luis Potosi
Usage:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class QuotesSpider(scrapy.Spider):
title = scrapy.Field() name = "noticias"
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
class QuotesSpider(scrapy.Spider): self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
name = "noticias"
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None) def parse(self, response):
day = getattr(self, 'day', None) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
pagination = response.xpath('//div[@class="pages"]/a/@href').extract()
yield scrapy.Request(url=self.baseURL, callback=self.parse) if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
def parse(self, response): for page in range(1, pages):
pagination = response.xpath('//div[@class="pages"]/a/@href').extract() yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) def parse_page(self, response):
for page in range(0,pages): for link in response.xpath('//*[@class="post-title"]/h2/a/@href').extract():
if ( page == 0 ): yield scrapy.Request(url=link, callback=self.parse_item)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page) def parse_item(self, response):
else: item = NoticiasItem()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) text = ''
## la fecha de la noticia ya incluye la zona horaria
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
def parse_page(self, response): if d is None:
for link in response.xpath('//*[@class="post-title"]/h2/a/@href').extract(): d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
yield scrapy.Request(url=link, callback=self.parse_item) item['date'] = d
item['title'] = response.css('h1.entry-title::text').extract_first()
def parse_item(self, response):
item = NoticiasItem() item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
text = ''
## la fecha de la noticia ya incluye la zona horaria for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() text += remove_tags(paragraph) + '\n'
if d is None: item['text'] = text
d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first() item['url'] = response.url
item['date'] = d # print item['title']
yield item
item['title'] = response.css('h1.entry-title::text').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
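parse above derives the number of archive pages from the last pagination link: strip the trailing slash and read the number after the final '/'. A tiny sketch with an assumed href:

last_href = 'http://lajornadasanluis.com.mx/2017/03/22/page/7/'

trimmed = last_href.strip('/')
pages = int(trimmed[trimmed.rfind('/') + 1:])
# pages == 7; the spider then requests /page/2 through /page/7 via parse_page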
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadaverItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaverPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaVer.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaVer.spiders'
#USER_AGENT = 'laJornadaVer (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaVer (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaVer.pipelines.LajornadaverPipeline': 300, 'laJornadaVer.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 """
OUTLET:
La Jornada de Veracruz, Ver.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo): class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)""" """clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para veracruz (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
def utcoffset(self, dt):
# zona horaria para veracruz (centro de mexico): utc-6
return timedelta(hours=-6)
class NoticiasItem(scrapy.Item): def tzname(self, dt):
title = scrapy.Field() # nombre de la zona horaria
text = scrapy.Field() return 'UTC-6'
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
tz = UTC() tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T') self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
yield scrapy.Request(url=self.builtURL, callback=self.parse)
yield scrapy.Request(url=self.builtURL, callback=self.parse)
def parse(self, response):
paths = ['//*[@class="article-header"]/h2/a/@href', def parse(self, response):
'//ul[@class="article-array content-category"]/li/a/@href'] paths = ['//*[@class="article-header"]/h2/a/@href',
for path in paths: '//ul[@class="article-array content-category"]/li/a/@href']
links = response.xpath(path).extract()
if ( len(links) > 0 ): for path in paths:
for link in links: links = response.xpath(path).extract()
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item) if len(links) > 0:
for link in links:
# for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract(): yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
# for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract():
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
def parse_page(self, response):
paths = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href'] def parse_page(self, response):
for path in paths: paths = ['//*[@id="article_list"]/h2/a/@href',
for link in response.xpath(path).extract(): '//*[@id="article_list"]/h3/a/@href']
if not ( link in self.link_list ):
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item) for path in paths:
for link in response.xpath(path).extract():
if not link in self.link_list:
def parse_item(self, response): yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
item = NoticiasItem()
text = ''
item['date'] = self.date def parse_item(self, response):
item = NoticiasItem()
title = response.xpath('//h2[@class="article-title"]/text()').extract_first() text = ''
title = title.replace('\r','') item['date'] = self.date
title = title.replace('\n','')
title = title.lstrip(' ') title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
title = title.rstrip(' ') title = title.replace('\r','')
item['title'] = title title = title.replace('\n','')
title = title.lstrip(' ')
topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first() title = title.rstrip(' ')
topic = topic.replace('\r','') item['title'] = title
topic = topic.replace('\n','')
topic = topic.lstrip(' ') topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
topic = topic.rstrip(' ') topic = topic.replace('\r','')
item['topic'] = topic topic = topic.replace('\n','')
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first() topic = topic.lstrip(' ')
topic = topic.rstrip(' ')
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract() item['topic'] = topic
if ( len(paragraph) > 0 ): # item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
for p in paragraph:
text += paragraph[3] paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
else: if len(paragraph) > 0:
for p in response.xpath('//*[@class="shortcode-content"]').extract(): for p in paragraph:
text += remove_tags(p) + '\n' text += p
else:
item['text'] = text for p in response.xpath('//*[@class="shortcode-content"]').extract():
item['url'] = response.url text += remove_tags(p) + '\n'
# print item['title']
yield item item['text'] = text
item['url'] = response.url
# print item['title']
yield item
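Unlike the other editions, this site is crawled through an Archive.aspx endpoint whose date query is day/month/year with zero padding, as built in start_requests above. Illustrative values only:

year, month, day = '2017', '3', '22'

base_url = 'http://www.jornadaveracruz.com.mx/'
built_url = base_url + 'Archive.aspx?date=' + day.zfill(2) + '/' + month.zfill(2) + '/' + year
# built_url == 'http://www.jornadaveracruz.com.mx/Archive.aspx?date=22/03/2017'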
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadazacItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadazacPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaZac.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaZac.spiders'
#USER_AGENT = 'laJornadaZac (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaZac (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaZac.pipelines.LajornadazacPipeline': 300, 'laJornadaZac.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaZac.items import NoticiasItem
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 OUTLET:
La Jornada Zacatecas, Zac.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item): def start_requests(self):
title = scrapy.Field() year = getattr(self, 'year', None)
text = scrapy.Field() month = getattr(self, 'month', None)
date = scrapy.Field() day = getattr(self, 'day', None)
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
self.baseURL = "http://ljz.mx/" + year + "/" + month + "/" + day
class QuotesSpider(scrapy.Spider): yield scrapy.Request(url=self.baseURL, callback=self.parse)
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None) def parse(self, response):
month = getattr(self, 'month', None) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
day = getattr(self, 'day', None)
self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if len(pagination) > 0:
yield scrapy.Request(url=self.baseURL, callback=self.parse) pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
def parse(self, response): for page in range(1,pages):
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract() yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) def parse_page(self, response):
for page in range(0,pages): for link in response.xpath('//h2[@class="cat-list-title"]/a/@href').extract():
if ( page == 0 ): yield scrapy.Request(url=link, callback=self.parse_item)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page) def parse_item(self, response):
else: item = NoticiasItem()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) text = ''
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
def parse_page(self, response): ## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico)
for link in response.xpath('//h2[@class="cat-list-title"]/a/@href').extract(): if d[-6:] != '-06:00':
yield scrapy.Request(url=link, callback=self.parse_item) d = d[:-6] + '-06:00'
item['date'] = d
def parse_item(self, response): item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item = NoticiasItem() item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
text = ''
content = response.xpath('//*[@class="entry-content clearfix"]/p').extract()
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first() if ( len(content) == 0 ):
## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico) content = response.xpath('//*[@class="entry-content clearfix"]/div/p').extract()
if d[-6:] != '-06:00': if ( len(content) == 0 ):
d = d[:-6] + '-06:00' content = response.xpath('//*[@class="entry-content clearfix"]/div').extract()
item['date'] = d if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div/div/p').extract()
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first() for paragraph in content:
text += remove_tags(paragraph) + '\n'
content = response.xpath('//*[@class="entry-content clearfix"]/p').extract() item['text'] = text
if ( len(content) == 0 ): item['url'] = response.url
content = response.xpath('//*[@class="entry-content clearfix"]/div/p').extract() # print item['title']
if ( len(content) == 0 ): yield item
content = response.xpath('//*[@class="entry-content clearfix"]/div').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div/div/p').extract()
for paragraph in content:
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
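parse_item above forces every scraped timestamp to carry the -06:00 offset of Zacatecas (central Mexico) by overwriting the last six characters whenever they differ. The same check as a standalone function with invented timestamps:

def force_central_mexico_offset(d):
    # mirrors the in-spider fix-up: keep the local part, pin the offset to UTC-6
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    return d

force_central_mexico_offset('2017-03-22T10:15:00+00:00')   # -> '2017-03-22T10:15:00-06:00'
force_central_mexico_offset('2017-03-22T10:15:00-06:00')   # unchanged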
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def __init__(self, filename):
self.filename = filename
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class MipuntodevistaPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'miPuntoDeVista.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'miPuntoDeVista.spiders'
#USER_AGENT = 'miPuntoDeVista (+http://www.yourdomain.com)' #USER_AGENT = 'miPuntoDeVista (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True ...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'miPuntoDeVista.pipelines.MipuntodevistaPipeline': 300, 'miPuntoDeVista.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 USAGE:
scrapy crawl noticias -t json --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://www.mipuntodevista.com.mx/'+year+'/'+month+'/'+day
urls = [ self.baseURL = 'http://www.mipuntodevista.com.mx/' + year + '/' + month + '/' + day
self.baseURL,
] yield scrapy.Request(url=self.baseURL, callback=self.parse)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse(self, response):
pagination = response.css('div.page-nav').css('a::attr(href)').extract() pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ): if len(pagination) > 0:
pagination = pagination[-2].strip('/') pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages): for page in range(1, pages):
if ( page == 0 ): yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) def parse_page(self, response):
else: for link in response.css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_page(self, response): def parse_item(self, response):
for link in response.css('h3.entry-title').css('a::attr(href)').extract(): item = NoticiasItem()
yield scrapy.Request(url=link, callback=self.parse_item) text = ''
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()
def parse_item(self, response):
item = NoticiasItem() d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
text = '' ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first() d = d[:-6] + '-06:00'
item['date'] = d
d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico) author = response.xpath('//*[@class="td-post-author-name"]/a/text()').extract_first()
if d[-6:] != '-06:00': if author is not None: item['author'] = author
d = d[:-6] + '-06:00' try:
item['date'] = d item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()[1]
except:
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first() item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract(): for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + "\n"
item['text'] = text item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
# print item['title']
yield item
yield item
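The try/except around the category lookup in parse_item above prefers the second entry of the td-category list and falls back to the first (or None) when the list is shorter. The same logic as a plain function; the name and sample lists are assumptions:

def pick_topic(categories):
    # categories is what extract() returns for the td-category list items
    if len(categories) > 1:
        return categories[1]
    return categories[0] if categories else None

pick_topic(['Portada', 'Yucatan'])   # -> 'Yucatan'
pick_topic(['Portada'])              # -> 'Portada'
pick_topic([])                       # -> None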