Commit ea7ae846 authored by Renán Sosa Guillen

crawlers

parent 54164b6d
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...
# -*- coding: utf-8 -*-
import scrapy, re
from grilloPorteno.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from heraldoAgs.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaAgs.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaOte.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaZac.items import NoticiasItem
...
@@ -8,7 +8,13 @@
import scrapy

-class LarazonItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class LarazonPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
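Note: the JsonWriterPipeline added above builds a JSON array incrementally: it writes "[" in open_spider, the first item bare, every later item prefixed with ",\n", and "]" in close_spider, taking the output path from the filename setting (the -s filename=... flag shown in the spiders' USO strings). A minimal standalone sketch of that same write pattern, outside Scrapy and with made-up sample records:

import json
from collections import OrderedDict

# Hypothetical records, only to illustrate the bracket/comma bookkeeping used by the pipeline.
items = [
    {"title": "nota uno", "url": "http://example.com/1"},
    {"title": "nota dos", "url": "http://example.com/2"},
]

with open("noticias.json", "w") as f:          # plays the role of the 'filename' setting
    f.write("[")                               # open_spider()
    for counter, it in enumerate(items, start=1):
        line = OrderedDict(sorted(it.items()))
        if counter == 1:
            f.write(json.dumps(line))          # first item, no separator
        else:
            f.write(",\n" + json.dumps(line))  # later items, comma-separated
    f.write("]")                               # close_spider()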
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laRazon.spiders'
#USER_AGENT = 'laRazon (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laRazon.pipelines.LarazonPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laRazon.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from laRazon.items import NoticiasItem

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+La Razón de México, CDMX
+USO:
+scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='https://www.razon.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "https://www.razon.com.mx/" + year + "/" + month + "/" + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])

-        for page in range(0,pages):
-            if page == 0:
-                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-            else:
-                yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+        for page in range(1, pages):
+            yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3[@class="entry-title td-module-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
        ## '-06:00' corresponde al UTC-6, zona horaria del centro de mexico
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d

        item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
        ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
        if ti is None:
            ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first()
        item['title'] = ti
-        # print item['title']

        paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
        if len(paragraphs) <= 0:
            paragraphs = response.xpath('//*[@dir="auto"]').extract()
        for p in paragraphs:
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url

-        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy

-class LaverdadyucItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class LaverdadyucPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laVerdadYuc.spiders'
#USER_AGENT = 'laVerdadYuc (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-DOWNLOAD_DELAY = 2
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laVerdadYuc.pipelines.LaverdadyucPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laVerdadYuc.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from laVerdadYuc.items import NoticiasItem

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-'''
+"""
+MEDIO:
+La Verdad Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

# class QuotesSpider(scrapy.Spider):
#     name = "noticias"
@@ -72,7 +69,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
+
+        self.baseURL = "http://laverdadnoticias.com/" + year + "/" + month + "/" + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy

-class LectormxItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class LectormxPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'lectorMX.spiders'
#USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'lectorMX.pipelines.LectormxPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'lectorMX.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
+"""
+MEDIO:
+Lector MX, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

class UTC(tzinfo):
    """clase para el 'time zone' (zona horaria)"""
    def utcoffset(self, dt):
        # zona horaria para yucatan (centro de mexico): utc-6
        return timedelta(hours=-6)
    def tzname(self, dt):
        # nombre de la zona horaria
        return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
        self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        text = ''
        item = NoticiasItem()
        item['date'] = self.date
        item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
        item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
        for paragraph in response.css('div.post-single-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
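Note: the UTC(tzinfo) helper kept in the lectorMX spider stamps every item with a fixed UTC-6 offset via datetime(...).isoformat('T'). A minimal sketch of the value that ends up in item['date'], assuming the class exactly as defined above:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Zona horaria fija UTC-6 (centro de Mexico), igual que en el spider."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

# isoformat('T') produces the string stored in item['date'],
# e.g. '2017-03-30T00:00:00-06:00' for -a year=2017 -a month=3 -a day=30
print(datetime(2017, 3, 30, tzinfo=UTC()).isoformat('T'))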
@@ -8,7 +8,13 @@
import scrapy

-class MipuntodevistaItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy, re
+from miPuntoDeVista.items import NoticiasItem

"""
+MEDIO:
+Mi Punto de Vista, Yucatán
USO:
scrapy crawl noticias --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
@@ -10,16 +14,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
@@ -8,7 +8,13 @@
import scrapy

-class NotirivasItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class NotirivasPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notirivas.spiders'
#USER_AGENT = 'notirivas (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notirivas.pipelines.NotirivasPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notirivas.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from notirivas.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notirivas, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

class UTC(tzinfo):
    """clase para el 'time zone' (zona horaria)"""
    def utcoffset(self, dt):
        # zona horaria para yucatan (centro de mexico): utc-6
        return timedelta(hours=-6)
    def tzname(self, dt):
        # nombre de la zona horaria
        return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
-        self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
+        self.baseURL = "http://gruporivas.com.mx/notirivas/" + year + "/" + month + "/" +day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[0]
            pages = int(pagination[pagination.rfind(' ')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//article/header/h2/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['date'] = self.date
        item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
        item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()

        content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
        if ( len(content) == 0 ):
            content = response.xpath('//*[@class="bdaia-post-content"]/div/div/p/text()').extract()
        if ( len(content) == 0 ):
            content = response.xpath('//*[@class="bdaia-post-content"]/p/span/text()').extract()

        for paragraph in content:
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy

-class NotisuresteItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class NotisurestePipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notisureste.spiders'
#USER_AGENT = 'notisureste (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notisureste.pipelines.NotisurestePipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notisureste.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from notisureste.items import NoticiasItem

-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notisureste, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://www.notisureste.com/'+year+'/'+month+'/'+day
-        urls = [
-            self.baseURL,
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
+        self.baseURL = "http://www.notisureste.com/" + year + "/" + month + "/" + day
+
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,int(pages)):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.css('div.td-ss-main-content').css('h3.entry-title').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()

        d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['url'] = response.url
        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()

        for paragraph in response.css('div.td-post-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text

        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy

-class PuntomedioItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class PuntomedioPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'puntoMedio.spiders'
#USER_AGENT = 'puntoMedio (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'puntoMedio.pipelines.PuntomedioPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'puntoMedio.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
+from puntoMedio.items import NoticiasItem
import scrapy, re

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+Punto Medio, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2018-09-28.json -a year=2017 -a month=9 -a day=28
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,7 +21,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day
+
+        self.baseURL = "http://www.puntomedio.mx/" + year + "/" + month + "/" + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy

-class Sona893Item(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class Sona893Pipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'sona893.spiders'
#USER_AGENT = 'sona893 (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'sona893.pipelines.Sona893Pipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'sona893.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from sona893.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Sona 89.3, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://sona893.fm/'+year+'/'+month+'/'+day
+        self.baseURL = "http://sona893.fm/" + year + "/" + month + "/" + day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
             pagination = pagination[-1].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for post in response.css('div.mosaicflow').css('div.post'):
             item = NoticiasItem()
             item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract_first()
             item['title'] = post.xpath('./h1/a/@title').extract_first()
             request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
             request.meta['item'] = item
             yield request
     def parse_item(self, response):
         item = response.meta['item']
         text = ''
         d = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
         ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
         for paragraph in response.css('div.single_text').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
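Note on the date handling in parse_item above: the spider only rewrites the offset text of article:published_time, it does not convert the clock time. A minimal standalone sketch of that fix-up (the helper name and the sample timestamps are invented for illustration):

def normalize_offset(date_str):
    # Same check as the spider: any offset other than '-06:00' (UTC-6,
    # Yucatán / central Mexico) has its offset text replaced in place.
    if date_str[-6:] != '-06:00':
        date_str = date_str[:-6] + '-06:00'
    return date_str

print(normalize_offset("2017-03-22T10:15:00+00:00"))  # 2017-03-22T10:15:00-06:00
print(normalize_offset("2017-03-22T10:15:00-06:00"))  # unchanged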
@@ -8,7 +8,13 @@
 import scrapy
-class TribunacabosItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class TribunacabosPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
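The pipeline above writes a JSON array incrementally: "[" when the spider opens, the first item bare, "," plus a newline before every later item, and "]" on close. A self-contained sketch of that write pattern outside Scrapy (the sample items and the demo.json path are made up):

import json
from collections import OrderedDict

items = [OrderedDict([("title", "Nota 1")]), OrderedDict([("title", "Nota 2")])]

with open("demo.json", "w") as f:
    f.write("[")
    for counter, line in enumerate(items, start=1):
        # same branching as process_item: no separator before the first item
        f.write(json.dumps(line) if counter == 1 else ",\n" + json.dumps(line))
    f.write("]")
# demo.json now holds: [{"title": "Nota 1"},
# {"title": "Nota 2"}]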
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'tribunaCabos.pipelines.TribunacabosPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'tribunaCabos.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from tribunaCabos.items import NoticiasItem
 """
 MEDIO:
@@ -18,28 +20,6 @@ DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]
 DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
-# def parseLocation(p, sign):
-# p = p.split(sign)
-# location = p[0].strip()
-# del p[0]
-# for j in range(0, len(p)):
-# p[j] = p[j].lstrip(" ")
-# p[j] = p[j].rstrip(" ")
-#
-# p = " ".join(p)
-# return p, location
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
@@ -86,30 +66,6 @@ class QuotesSpider(scrapy.Spider):
             p = LOC_RE.sub('', p)
             text += p + "\n"
-            # for i in range(0, len(bodyText)):
-            # p = remove_tags(bodyText[i])
-            # if i == 0:
-            # sign = u'.\u2013'
-            # limit = 35
-            # n = p.count(sign)
-            # if n == 0:
-            # sign = ".-"
-            # limit = 30
-            # n = p.count(sign)
-            # if n > 0 and len(p.split(sign)[0]) <= limit:
-            # loc = p.split(sign)
-            # if len(loc[0].split(",")[0]) <= 20:
-            # p = loc
-            # loc = p[0].split(",")
-            # item['location'] = loc[0].strip()
-            # # item['location'] = item['location'].rstrip()
-            # del p[0]
-            # for j in range(0, len(p)):
-            # p[j] = p[j].lstrip(" ")
-            # p[j] = p[j].rstrip(" ")
-            # p = " ".join(p)
-            #
-            # text += p + "\n"
         item['text'] = text
         item['url'] = response.url
...
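As a point of reference, DAT2_RE defined above matches trailing Spanish datelines of the form ", a <día> de <mes> de <año>". A standalone check against an invented sentence (the sample text is not from the crawled sites):

import re

DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')

p = "La Paz, Baja California Sur, a 22 de marzo de 2017"
print(DAT2_RE.sub('', p))  # -> La Paz, Baja California Sur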
@@ -8,7 +8,13 @@
 import scrapy
-class UnomasunoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class UnomasunoPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'unoMasUno.spiders'
 #USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'unoMasUno.pipelines.UnomasunoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'unoMasUno.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
-import scrapy, re
+import scrapy, re, json
+from unoMasUno.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
-'''
+"""
+MEDIO:
+Uno Más Uno, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
 class UTC(tzinfo):
     """clase para el 'time zone' (zona horaria)"""
     def utcoffset(self, dt):
         # zona horaria para hidalgo (centro de mexico): utc-6
         return timedelta(hours=-6)
     def tzname(self, dt):
         # nombre de la zona horaria
         return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         self.tz = UTC()
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
+        self.year = getattr(self, 'year', None)
+        self.month = getattr(self, 'month', None)
+        self.day = getattr(self, 'day', None)
         self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                             'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                             'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
-        self.baseURL='http://www.unomasuno.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
         if pagination is None:
             pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
             if len(pagination) > 0:
                 pagination = pagination[-1].strip('/')
                 pages = int(pagination[pagination.rfind('/')+1:])
-                for page in range(0,pages):
-                    if page == 0:
-                        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                    else:
-                        yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
-            else:
-                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+                for page in range(1, pages):
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             pagination = pagination.strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if page == 0:
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
-        d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
-        d = d.replace(',','').split(' ')
-        item['date'] = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat('T')
+        try:
+            jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
+            jsonInfo = json.loads(remove_tags(jsonInfo))
+            dat = jsonInfo['datePublished']
+        except:
+            try:
+                d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
+                d = d.replace(',', '').split(' ')
+                dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
+            except:
+                dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
+        item['date'] = dat
         item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
         item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
         for p in response.xpath('//*[@class="entry"]/p').extract():
             text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
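A self-contained sketch of the fallback date handling in parse_item above, outside Scrapy: the Spanish month name taken from the post meta is mapped through date_parser and stamped with the UTC-6 tzinfo (the sample string is invented):

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
               'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
               'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

# same parsing steps as the spider: drop the comma, split on spaces
d = "22 septiembre, 2017".replace(',', '').split(' ')
print(datetime(int(d[2]), date_parser[d[1].lower()], int(d[0]), tzinfo=UTC()).isoformat("T"))
# 2017-09-22T00:00:00-06:00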
@@ -8,7 +8,13 @@
 import scrapy
-class YucatanalamanoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class YucatanalamanoPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
 #USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'yucatanALaMano.pipelines.YucatanalamanoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from yucatanALaMano.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Yucatán a la Mano, Yuc.
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://www.yucatanalamano.com/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
             pagination = pagination[-1].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
         item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first()
         d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
         ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
         item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
         for paragraph in response.css('div.entry-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
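The page count in parse() above is read off the last pagination href: drop the trailing slash and keep whatever follows the final "/". A standalone illustration with the same string handling (the hrefs are invented):

pagination = ["http://www.yucatanalamano.com/2017/03/22/page/2/",
              "http://www.yucatanalamano.com/2017/03/22/page/7/"]
last = pagination[-1].strip('/')
pages = int(last[last.rfind('/') + 1:])
print(pages)  # 7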
@@ -8,7 +8,13 @@
 import scrapy
-class YucatanencortoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class YucatanencortoPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanEnCorto.spiders'
 #USER_AGENT = 'yucatanEnCorto (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'yucatanEnCorto.pipelines.YucatanencortoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'yucatanEnCorto.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from yucatanEnCorto.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
 """
-Crawler descarga noticias desde el 2017.10.18
+MEDIO:
+Yucatán en Corto, Yuc.
+Esta versión descarga noticias desde el 2017.10.18, por cambio en la URL del sitio.
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
 """
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.yucatanencorto.com/noticias/" + year + "/" + month + "/" + day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
             pagination = pagination[-2].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
             for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
         title = response.xpath('//*[@class="td-post-header-holder"]/header/h1/text()').extract_first()
         if title is None:
             title = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
         item['title'] = title
         d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
         ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
         item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a/text()').extract()[1]
         paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
         if len(paragraphs) <= 2:
             paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
         for p in paragraphs:
             text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
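The body extraction in parse_item above prefers the <div> children of td-post-content and falls back to its <p> children when two or fewer are found. An offline sketch of that fallback against a made-up HTML snippet:

import re
from scrapy import Selector

TAG_RE = re.compile(r'<[^>]+>')
html = '<div class="td-post-content"><p>Primer párrafo.</p><p>Segundo párrafo.</p></div>'
sel = Selector(text=html)

paragraphs = sel.xpath('//*[@class="td-post-content"]/div').extract()
if len(paragraphs) <= 2:
    # no (or too few) direct <div> children, so fall back to the <p> children
    paragraphs = sel.xpath('//*[@class="td-post-content"]/p').extract()

text = ''.join(TAG_RE.sub('', p) + '\n' for p in paragraphs)
print(text)  # prints the two paragraphs, one per line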