Commit ea7ae846 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 54164b6d
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
......
# -*- coding: utf-8 -*-
import scrapy, re
from grilloPorteno.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from heraldoAgs.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaAgs.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaOte.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
......
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaZac.items import NoticiasItem
......
......@@ -8,7 +8,13 @@
import scrapy
class LarazonItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LarazonPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
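# A minimal sketch of what the pipeline code above produces, assuming the file
# name arrives via "scrapy crawl noticias -s filename=out.json": the file is
# opened, "[" is written, each item is dumped as a JSON object separated by
# ",\n", and "]" closes the array. The function and sample item below are
# made up for illustration; they only reproduce the same write pattern
# outside Scrapy.
import json
from collections import OrderedDict

def write_items(filename, items):
    counter = 0
    with open(filename, 'w') as f:
        f.write("[")
        for item in items:
            # keep only the fields that are present, in a fixed key order
            row = [(k, item[k]) for k in
                   ("date", "topic", "title", "author", "location", "text", "url")
                   if k in item]
            line = OrderedDict(row)
            counter += 1
            if counter == 1:
                f.write(json.dumps(line))
            else:
                f.write(",\n" + json.dumps(line))
        f.write("]")

write_items("out.json", [{"title": "Example", "url": "http://example.com"}])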
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laRazon.spiders'
#USER_AGENT = 'laRazon (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laRazon.pipelines.LarazonPipeline': 300,
#}
ITEM_PIPELINES = {
'laRazon.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from laRazon.items import NoticiasItem
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
'''
"""
MEDIA OUTLET:
La Razón de México, CDMX
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
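# What remove_tags does, on a made-up fragment: every "<...>" tag is dropped
# and only the text in between is kept.
assert remove_tags('<p>Texto de la <a href="#">nota</a>.</p>') == 'Texto de la nota.'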
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='https://www.razon.com.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if page == 0:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3[@class="entry-title td-module-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of central Mexico
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
if ti is None:
ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first()
item['title'] = ti
paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
if len(paragraphs) <= 0:
paragraphs = response.xpath('//*[@dir="auto"]').extract()
for p in paragraphs:
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
self.baseURL = "https://www.razon.com.mx/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3[@class="entry-title td-module-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of central Mexico
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
if ti is None:
ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first()
item['title'] = ti
paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
if len(paragraphs) <= 0:
paragraphs = response.xpath('//*[@dir="auto"]').extract()
for p in paragraphs:
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
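# Two small mechanics from the spider above, shown in isolation on made-up
# inputs: the total page count is read from the tail of the "last" pagination
# link, and the article timestamp gets its offset suffix rewritten to -06:00
# (only the suffix is replaced; the clock time itself is not converted).
pagination = "https://www.razon.com.mx/2017/9/28/page/7/".strip('/')
pages = int(pagination[pagination.rfind('/') + 1:])
assert pages == 7

d = "2017-09-28T10:15:00+00:00"
if d[-6:] != '-06:00':
    d = d[:-6] + '-06:00'
assert d == "2017-09-28T10:15:00-06:00"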
......@@ -8,7 +8,13 @@
import scrapy
class LaverdadyucItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LaverdadyucPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laVerdadYuc.spiders'
#USER_AGENT = 'laVerdadYuc (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laVerdadYuc.pipelines.LaverdadyucPipeline': 300,
#}
ITEM_PIPELINES = {
'laVerdadYuc.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from laVerdadYuc.items import NoticiasItem
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
'''
"""
MEDIA OUTLET:
La Verdad Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# class QuotesSpider(scrapy.Spider):
# name = "noticias"
......@@ -72,7 +69,8 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
self.baseURL = "http://laverdadnoticias.com/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
......
......@@ -8,7 +8,13 @@
import scrapy
class LectormxItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LectormxPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'lectorMX.spiders'
#USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'lectorMX.pipelines.LectormxPipeline': 300,
#}
ITEM_PIPELINES = {
'lectorMX.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
"""
MEDIA OUTLET:
Lector MX, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para yucatan (centro de mexico): utc-6
return timedelta(hours=-6)
def utcoffset(self, dt):
# zona horaria para yucatan (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
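# A quick check of what the UTC class above is for: it tags the naive crawl
# date with a fixed -06:00 offset so isoformat() carries the zone. The date
# below is a made-up example.
from datetime import datetime
assert datetime(2017, 3, 30, tzinfo=UTC()).isoformat('T') == '2017-03-30T00:00:00-06:00'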
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//h2[@class="title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
text = ''
item = NoticiasItem()
item['date'] = self.date
item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
for paragraph in response.css('div.post-single-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//h2[@class="title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
text = ''
item = NoticiasItem()
item['date'] = self.date
item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
for paragraph in response.css('div.post-single-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
......@@ -8,7 +8,13 @@
import scrapy
class MipuntodevistaItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy, re
from miPuntoDeVista.items import NoticiasItem
"""
MEDIA OUTLET:
Mi Punto de Vista, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
......@@ -10,16 +14,6 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class NotirivasItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class NotirivasPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notirivas.spiders'
#USER_AGENT = 'notirivas (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'notirivas.pipelines.NotirivasPipeline': 300,
#}
ITEM_PIPELINES = {
'notirivas.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from notirivas.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIA OUTLET:
Notirivas, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para yucatan (centro de mexico): utc-6
return timedelta(hours=-6)
def utcoffset(self, dt):
# zona horaria para yucatan (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
if ( len(pagination) > 0 ):
pagination = pagination[0]
pages = int(pagination[pagination.rfind(' ')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//article/header/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()
content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="bdaia-post-content"]/div/div/p/text()').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="bdaia-post-content"]/p/span/text()').extract()
for paragraph in content:
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
self.baseURL = "http://gruporivas.com.mx/notirivas/" + year + "/" + month + "/" +day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
if len(pagination) > 0:
pagination = pagination[0]
pages = int(pagination[pagination.rfind(' ')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//article/header/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()
content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="bdaia-post-content"]/div/div/p/text()').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="bdaia-post-content"]/p/span/text()').extract()
for paragraph in content:
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
......@@ -8,7 +8,13 @@
import scrapy
class NotisuresteItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class NotisurestePipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notisureste.spiders'
#USER_AGENT = 'notisureste (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'notisureste.pipelines.NotisurestePipeline': 300,
#}
ITEM_PIPELINES = {
'notisureste.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from notisureste.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIA OUTLET:
Notisureste, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.notisureste.com/'+year+'/'+month+'/'+day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,int(pages)):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('div.td-ss-main-content').css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()
d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['url'] = response.url
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
# print item['title']
yield item
self.baseURL = "http://www.notisureste.com/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.td-ss-main-content').css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()
d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['url'] = response.url
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
# print item['title']
yield item
......@@ -8,7 +8,13 @@
import scrapy
class PuntomedioItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class PuntomedioPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'puntoMedio.spiders'
#USER_AGENT = 'puntoMedio (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'puntoMedio.pipelines.PuntomedioPipeline': 300,
#}
ITEM_PIPELINES = {
'puntoMedio.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
from puntoMedio.items import NoticiasItem
import scrapy, re
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
'''
"""
MEDIA OUTLET:
Punto Medio, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2018-09-28.json -a year=2017 -a month=9 -a day=28
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,7 +21,8 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day
self.baseURL = "http://www.puntomedio.mx/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
......
......@@ -8,7 +8,13 @@
import scrapy
class Sona893Item(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class Sona893Pipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'sona893.spiders'
#USER_AGENT = 'sona893 (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'sona893.pipelines.Sona893Pipeline': 300,
#}
ITEM_PIPELINES = {
'sona893.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from sona893.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIA OUTLET:
Sona 89.3, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = "http://sona893.fm/" + year + "/" + month + "/" + day
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://sona893.fm/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for post in response.css('div.mosaicflow').css('div.post'):
item = NoticiasItem()
item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract_first()
item['title'] = post.xpath('./h1/a/@title').extract_first()
request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
item = response.meta['item']
text = ''
d = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.single_text').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for post in response.css('div.mosaicflow').css('div.post'):
item = NoticiasItem()
item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract_first()
item['title'] = post.xpath('./h1/a/@title').extract_first()
request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
item = response.meta['item']
text = ''
d = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.single_text').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
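# The hand-off used above — fill part of the item on the listing page, attach
# it to the request, finish it on the article page — in a minimal standalone
# sketch. The spider name, URLs and field values are hypothetical.
import scrapy

class MetaHandoffSpider(scrapy.Spider):
    name = "meta_handoff_demo"  # hypothetical spider name
    start_urls = ["http://example.com/archive"]

    def parse(self, response):
        item = {"title": "picked up on the listing page"}
        request = scrapy.Request("http://example.com/article", callback=self.parse_item)
        request.meta['item'] = item      # carry the partial item to the next callback
        yield request

    def parse_item(self, response):
        item = response.meta['item']     # retrieve it and finish filling it in
        item['url'] = response.url
        yield item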
......@@ -8,7 +8,13 @@
import scrapy
class TribunacabosItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class TribunacabosPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'tribunaCabos.pipelines.TribunacabosPipeline': 300,
#}
ITEM_PIPELINES = {
'tribunaCabos.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from tribunaCabos.items import NoticiasItem
"""
MEDIA OUTLET:
......@@ -18,28 +20,6 @@ DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]
DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
# def parseLocation(p, sign):
# p = p.split(sign)
# location = p[0].strip()
# del p[0]
# for j in range(0, len(p)):
# p[j] = p[j].lstrip(" ")
# p[j] = p[j].rstrip(" ")
#
# p = " ".join(p)
# return p, location
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -86,30 +66,6 @@ class QuotesSpider(scrapy.Spider):
p = LOC_RE.sub('', p)
text += p + "\n"
# for i in range(0, len(bodyText)):
# p = remove_tags(bodyText[i])
# if i == 0:
# sign = u'.\u2013'
# limit = 35
# n = p.count(sign)
# if n == 0:
# sign = ".-"
# limit = 30
# n = p.count(sign)
# if n > 0 and len(p.split(sign)[0]) <= limit:
# loc = p.split(sign)
# if len(loc[0].split(",")[0]) <= 20:
# p = loc
# loc = p[0].split(",")
# item['location'] = loc[0].strip()
# # item['location'] = item['location'].rstrip()
# del p[0]
# for j in range(0, len(p)):
# p[j] = p[j].lstrip(" ")
# p[j] = p[j].rstrip(" ")
# p = " ".join(p)
#
# text += p + "\n"
item['text'] = text
item['url'] = response.url
......
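# -*- coding: utf-8 -*-
# LOC_RE itself is not shown in this hunk; the pattern below is only a
# hypothetical stand-in used to illustrate the intent of the LOC_RE.sub('', p)
# call above: stripping a leading "CITY, ST.-" dateline from each paragraph
# before the text is accumulated. The sample sentence is made up.
import re

LOC_RE_DEMO = re.compile(r'^[A-ZÁÉÍÓÚÑ ]+,\s*[A-Za-z\.]+\s*[.\-]+\s*')  # hypothetical
p = "LOS CABOS, BCS.- El ayuntamiento aprobó el presupuesto."
assert LOC_RE_DEMO.sub('', p) == "El ayuntamiento aprobó el presupuesto."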
......@@ -8,7 +8,13 @@
import scrapy
class UnomasunoItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class UnomasunoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'unoMasUno.spiders'
#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'unoMasUno.pipelines.UnomasunoPipeline': 300,
#}
ITEM_PIPELINES = {
'unoMasUno.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
# -*- coding: utf-8 -*-
import scrapy, re, json
from unoMasUno.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
'''
"""
MEDIA OUTLET:
Uno Más Uno, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para hidalgo (centro de mexico): utc-6
return timedelta(hours=-6)
def utcoffset(self, dt):
# zona horaria para hidalgo (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL='http://www.unomasuno.com.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
if pagination is None:
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if page == 0:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if page == 0:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',','').split(' ')
item['date'] = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat('T')
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
for p in response.xpath('//*[@class="entry"]/p').extract():
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
if pagination is None:
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
try:
jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
jsonInfo = json.loads(remove_tags(jsonInfo))
dat = jsonInfo['datePublished']
except:
try:
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',', '').split(' ')
dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
except:
dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
item['date'] = dat
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
for p in response.xpath('//*[@class="entry"]/p').extract():
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
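# Editor's sketch (not part of the original spider): a minimal illustration of the
# post-meta fallback used in parse_item above, which turns a Spanish date string
# into the ISO value stored in item['date']. The sample string is hypothetical.
def _example_parse_spanish_date():
    from datetime import datetime
    date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                   'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                   'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
    d = "22 septiembre, 2017".replace(',', '').split(' ')
    # e.g. '2017-09-22T00:00:00-06:00'
    return datetime(int(d[2]), date_parser[d[1].lower()], int(d[0]), tzinfo=UTC()).isoformat("T")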
......@@ -8,7 +8,13 @@
import scrapy
class YucatanalamanoItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class YucatanalamanoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
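# Editor's sketch (not part of the original pipeline): JsonWriterPipeline writes the
# file named by the "filename" setting as a single JSON array, e.g.
# [{"date": "...", "topic": "...", "title": "...", "text": "...", "url": "..."}, ...],
# so the scraped items can be loaded back with the standard library:
def _load_scraped_items(path):
    import json
    with open(path) as fp:
        return json.load(fp)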
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
#USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'yucatanALaMano.pipelines.YucatanalamanoPipeline': 300,
#}
ITEM_PIPELINES = {
'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
}
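# Editor's note: JsonWriterPipeline reads its output path from the custom "filename"
# setting, which is supplied on the command line rather than in this file, e.g.:
#   scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22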
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from yucatanALaMano.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIA OUTLET:
Yucatán a la Mano, Yuc.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanalamano.com/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first()
d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first()
d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
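# Editor's sketch (not part of the original spider): the offset fix above simply
# rewrites whatever zone the <time datetime="..."> attribute carries to the paper's
# local UTC-6. The sample value is hypothetical.
def _example_force_utc_minus_6():
    d = "2017-03-22T10:15:00+00:00"
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    return d  # '2017-03-22T10:15:00-06:00'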
......@@ -8,7 +8,13 @@
import scrapy
class YucatanencortoItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class YucatanencortoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanEnCorto.spiders'
#USER_AGENT = 'yucatanEnCorto (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'yucatanEnCorto.pipelines.YucatanencortoPipeline': 300,
#}
ITEM_PIPELINES = {
'yucatanEnCorto.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
from yucatanEnCorto.items import NoticiasItem
"""
Crawler downloads news from 2017.10.18 onward
MEDIA OUTLET:
Yucatán en Corto, Yuc.
This version downloads news from 2017.10.18 onward, because the site's URL changed.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
title = response.xpath('//*[@class="td-post-header-holder"]/header/h1/text()').extract_first()
if title is None:
title = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
item['title'] = title
d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a/text()').extract()[1]
paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
if len(paragraphs) <= 2:
paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
for p in paragraphs:
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = "http://www.yucatanencorto.com/noticias/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
title = response.xpath('//*[@class="td-post-header-holder"]/header/h1/text()').extract_first()
if title is None:
title = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
item['title'] = title
d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a/text()').extract()[1]
paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
if len(paragraphs) <= 2:
paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
for p in paragraphs:
text += remove_tags(p) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
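# Editor's sketch (not part of the original spider): how parse() above derives the
# number of listing pages from the last pagination href. The sample URL is hypothetical.
def _example_page_count():
    pagination = "http://www.yucatanencorto.com/noticias/2017/3/22/page/7/"
    pagination = pagination.strip('/')
    # the text after the final '/' is the last page number
    return int(pagination[pagination.rfind('/')+1:])  # 7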