Commit ea7ae846 authored by Renán Sosa Guillen

crawlers

parent 54164b6d
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...

# -*- coding: utf-8 -*-
import scrapy, re
from grilloPorteno.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from heraldoAgs.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaAgs.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaOte.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaZac.items import NoticiasItem
...
@@ -8,7 +8,13 @@
import scrapy
-class LarazonItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LarazonPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
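The JsonWriterPipeline added here (and repeated verbatim in each project below) streams items into a single JSON array: open_spider writes "[", the first item is written bare, every later item is prefixed with ",\n", and close_spider writes "]". A quick way to sanity-check a file produced this way — the filename is only an example taken from the spider docstrings, not a file that ships with the commit:

import json

# Produced by a run such as:
#   scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
with open("2017-09-28.json") as f:
    items = json.load(f)  # the "[", comma-separated objects and "]" form a valid JSON array

print(len(items))
print(items[0].get("title"))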
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laRazon.spiders'
#USER_AGENT = 'laRazon (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laRazon.pipelines.LarazonPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laRazon.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from laRazon.items import NoticiasItem
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+La Razón de México, CDMX
+USO:
+scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
+"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -10,16 +15,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)
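All of these spiders share the same small helper for stripping markup from scraped fragments; a self-contained illustration (the sample HTML string is invented):

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    # Drop anything that looks like an HTML tag; keep the text in between.
    return TAG_RE.sub('', text)

print(remove_tags('<p>Hola <b>mundo</b></p>'))  # -> Hola mundo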
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -28,20 +23,18 @@ class QuotesSpider(scrapy.Spider):
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='https://www.razon.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "https://www.razon.com.mx/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])
-        for page in range(0,pages):
-            if page == 0:
-                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-            else:
+        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
...
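Taken together, the spider-side change in this commit follows one pattern: the first results page for the requested date is handed straight to the article-level callback, the page count is read from the pagination links, and pages 2..N are requested in a single loop. The sketch below is reconstructed from the diff above — the class name, the fixed date and the empty parse_page are placeholders for illustration, not the verbatim file:

import scrapy

class NoticiasSketchSpider(scrapy.Spider):
    # Hypothetical class for illustration; the real spider is QuotesSpider with name "noticias".
    name = "noticias_sketch"

    def start_requests(self):
        # In the real spider year/month/day come from -a command line arguments; fixed here.
        self.baseURL = "https://www.razon.com.mx/2017/9/28"
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        # First results page goes straight to the article-level parser.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        # The "last" pagination link ends in the total number of pages.
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/') + 1:])
        # Pages 2..pages are requested explicitly (page 1 was already yielded above).
        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        pass  # article extraction omitted in this sketch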
@@ -8,7 +8,13 @@
import scrapy
-class LaverdadyucItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LaverdadyucPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laVerdadYuc.spiders'
#USER_AGENT = 'laVerdadYuc (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-DOWNLOAD_DELAY = 2
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laVerdadYuc.pipelines.LaverdadyucPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laVerdadYuc.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from laVerdadYuc.items import NoticiasItem
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-'''
+"""
+MEDIO:
+La Verdad Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
# class QuotesSpider(scrapy.Spider):
#     name = "noticias"
@@ -72,7 +69,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
+        self.baseURL = "http://laverdadnoticias.com/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy
-class LectormxItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LectormxPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'lectorMX.spiders'
#USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'lectorMX.pipelines.LectormxPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'lectorMX.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
+"""
+MEDIO:
+Lector MX, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
@@ -22,16 +27,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -47,17 +42,15 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
...
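lectorMX (like notirivas and unoMasUno below) timestamps each crawl with a small tzinfo subclass; only its tzname value ('UTC-6') is visible in this diff, so the version below is an assumed minimal reconstruction for illustration — the -6 h utcoffset and the zero dst are assumptions, not code from the commit:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Assumed minimal UTC-6 timezone, reconstructed for illustration."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)  # assumption: central Mexico offset
    def tzname(self, dt):
        return 'UTC-6'              # this string does appear in the diff
    def dst(self, dt):
        return timedelta(0)

# The spiders build a date string roughly like this:
print(datetime(2017, 3, 30, tzinfo=UTC()).isoformat('T'))  # 2017-03-30T00:00:00-06:00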
@@ -8,7 +8,13 @@
import scrapy
-class MipuntodevistaItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

# -*- coding: utf-8 -*-
import scrapy, re
+from miPuntoDeVista.items import NoticiasItem
"""
+MEDIO:
+Mi Punto de Vista, Yucatán
USO:
scrapy crawl noticias --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
@@ -10,16 +14,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
@@ -8,7 +8,13 @@
import scrapy
-class NotirivasItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class NotirivasPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notirivas.spiders'
#USER_AGENT = 'notirivas (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notirivas.pipelines.NotirivasPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notirivas.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from notirivas.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notirivas, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
@@ -22,16 +27,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -41,23 +36,22 @@ class QuotesSpider(scrapy.Spider):
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
-        self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
+        self.baseURL = "http://gruporivas.com.mx/notirivas/" + year + "/" + month + "/" +day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[0]
            pages = int(pagination[pagination.rfind(' ')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
...
@@ -8,7 +8,13 @@
import scrapy
-class NotisuresteItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class NotisurestePipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notisureste.spiders'
#USER_AGENT = 'notisureste (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notisureste.pipelines.NotisurestePipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notisureste.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from notisureste.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notisureste, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,26 +21,22 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://www.notisureste.com/'+year+'/'+month+'/'+day
-        urls = [
-            self.baseURL,
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
+        self.baseURL = "http://www.notisureste.com/" + year + "/" + month + "/" + day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,int(pages)):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
@@ -73,4 +64,3 @@ class QuotesSpider(scrapy.Spider):
        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy
-class PuntomedioItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class PuntomedioPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'puntoMedio.spiders'
#USER_AGENT = 'puntoMedio (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'puntoMedio.pipelines.PuntomedioPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'puntoMedio.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
+from puntoMedio.items import NoticiasItem
import scrapy, re
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+Punto Medio, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2018-09-28.json -a year=2017 -a month=9 -a day=28
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,7 +21,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.puntomedio.mx/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy
-class Sona893Item(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class Sona893Pipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'sona893.spiders'
#USER_AGENT = 'sona893 (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'sona893.pipelines.Sona893Pipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'sona893.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from sona893.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Sona 89.3, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -25,27 +21,23 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://sona893.fm/'+year+'/'+month+'/'+day
+        self.baseURL = "http://sona893.fm/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
        for post in response.css('div.mosaicflow').css('div.post'):
...
@@ -8,7 +8,13 @@
import scrapy
-class TribunacabosItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class TribunacabosPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'tribunaCabos.pipelines.TribunacabosPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'tribunaCabos.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from tribunaCabos.items import NoticiasItem
"""
MEDIO:
@@ -18,28 +20,6 @@ DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]
DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
-# def parseLocation(p, sign):
-#     p = p.split(sign)
-#     location = p[0].strip()
-#     del p[0]
-#     for j in range(0, len(p)):
-#         p[j] = p[j].lstrip(" ")
-#         p[j] = p[j].rstrip(" ")
-#
-#     p = " ".join(p)
-#     return p, location
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -86,30 +66,6 @@ class QuotesSpider(scrapy.Spider):
            p = LOC_RE.sub('', p)
            text += p + "\n"
-        # for i in range(0, len(bodyText)):
-        #     p = remove_tags(bodyText[i])
-        #     if i == 0:
-        #         sign = u'.\u2013'
-        #         limit = 35
-        #         n = p.count(sign)
-        #         if n == 0:
-        #             sign = ".-"
-        #             limit = 30
-        #             n = p.count(sign)
-        #         if n > 0 and len(p.split(sign)[0]) <= limit:
-        #             loc = p.split(sign)
-        #             if len(loc[0].split(",")[0]) <= 20:
-        #                 p = loc
-        #                 loc = p[0].split(",")
-        #                 item['location'] = loc[0].strip()
-        #                 # item['location'] = item['location'].rstrip()
-        #                 del p[0]
-        #                 for j in range(0, len(p)):
-        #                     p[j] = p[j].lstrip(" ")
-        #                     p[j] = p[j].rstrip(" ")
-        #                 p = " ".join(p)
-        #
-        #     text += p + "\n"
        item['text'] = text
        item['url'] = response.url
...
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class UnomasunoItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class UnomasunoPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'unoMasUno.spiders'
#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
...@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'unoMasUno.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
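These settings route items through JsonWriterPipeline, which takes its output path from the custom filename setting rather than Scrapy's feed exports, so the crawl is started with -s filename=... as the spider docstring below shows. Driving the same crawl from a script might look roughly like this sketch (the date values are only an example):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()            # loads the project's settings.py
settings.set('filename', '2017-09-22.json')  # read by JsonWriterPipeline.from_crawler
process = CrawlerProcess(settings)
process.crawl('noticias', year='2017', month='9', day='22')
process.start()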
......
# -*- coding: utf-8 -*-
import scrapy, re, json
from unoMasUno.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
OUTLET:
Uno Más Uno, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
...@@ -23,36 +28,27 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
if pagination is None:
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
...@@ -60,24 +56,15 @@ class QuotesSpider(scrapy.Spider):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
def parse_page(self, response):
...@@ -89,9 +76,18 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
try:
jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
jsonInfo = json.loads(remove_tags(jsonInfo))
dat = jsonInfo['datePublished']
except:
try:
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',', '').split(' ')
dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
except:
dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
item['date'] = dat
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
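The reworked parse_page resolves the article date in three steps: the datePublished field of the page's JSON-LD block, then the Spanish date in the post-meta span, and finally the year/month/day the spider was launched with. A standalone sketch of that cascade (the helper name and arguments are illustrative, not from the commit):

import json, re
from datetime import datetime

TAG_RE = re.compile(r'<[^>]+>')
MONTHS = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
          'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

def resolve_date(ldjson_script, post_meta_text, year, month, day, tz=None):
    # 1) JSON-LD embedded by the site, e.g. {"datePublished": "2017-09-22T10:00:00-05:00", ...}
    if ldjson_script:
        try:
            return json.loads(TAG_RE.sub('', ldjson_script))['datePublished']
        except (ValueError, KeyError):
            pass
    # 2) a "22 septiembre, 2017" style string from the post-meta span
    if post_meta_text:
        try:
            d = post_meta_text.replace(',', '').split(' ')
            return datetime(int(d[2]), MONTHS[d[1].lower()], int(d[0]), tzinfo=tz).isoformat('T')
        except (ValueError, KeyError, IndexError):
            pass
    # 3) fall back to the date the spider was invoked with
    return datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')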
......
...@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class YucatanalamanoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
#USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from yucatanALaMano.items import NoticiasItem
"""
OUTLET:
Yucatán a la Mano, Yuc.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
...@@ -25,28 +21,23 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanalamano.com/'+year+'/'+month+'/'+day
self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
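The parse methods above derive the number of listing pages from a pagination link whose URL ends in .../page/N, then queue pages 2..N while page 1 is re-parsed directly with dont_filter. A small worked sketch of that parsing step, on a hypothetical URL:

last_link = 'http://www.yucatanalamano.com/2017/03/22/page/7/'   # hypothetical last-page link
trimmed = last_link.strip('/')                                   # drop the trailing slash
pages = int(trimmed[trimmed.rfind('/') + 1:])                    # -> 7
for page in range(1, pages):                                     # page values 1..6
    print(trimmed.rsplit('/page/', 1)[0] + '/page/' + str(page + 1))   # /page/2 .. /page/7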
......
...@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class YucatanencortoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanEnCorto.spiders'
#USER_AGENT = 'yucatanEnCorto (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yucatanEnCorto.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from yucatanEnCorto.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
""" """
Crawler descarga noticias desde el 2017.10.18 MEDIO:
Yucatán en Corto, Yuc.
Esta versión descarga noticias desde el 2017.10.18, por cambio en la URL del sitio.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
...@@ -27,28 +22,23 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
self.baseURL = "http://www.yucatanencorto.com/noticias/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
......