Commit fc191b5c authored by Renán Sosa Guillen

crawlers

parent 60315988
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
twitter = scrapy.Field()
email = scrapy.Field()
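# --- Minimal usage sketch (not part of the project code; runs only when executed directly) ---
# A NoticiasItem behaves like a dict restricted to the fields declared above, which is how
# the JsonWriterPipeline in pipelines.py reads it. Sample values are hypothetical.
if __name__ == "__main__":
    item = NoticiasItem()
    item['title'] = "Titular de ejemplo"
    item['url'] = "https://www.diariocolatino.com/ejemplo"
    print(dict(item))  # a plain dict containing only the fields that were set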
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariocolatinoSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DiariocolatinoDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Retrieve the value passed on the command line with "-s filename=<output file>"
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Copy only the fields the spider actually set, preserving this column order in the output.
for field in ("date", "topic", "title", "author", "location", "twitter", "email", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
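# --- Standalone sketch (hypothetical file name) of the output layout produced above ---
# JsonWriterPipeline writes "[" on open, the first record as-is, later records prefixed with
# ",\n", and "]" on close, so the result is a plain JSON array readable with json.load().
if __name__ == "__main__":
    records = [{"title": "t1"}, {"title": "t2"}]
    with open("sample_output.json", "w") as f:
        f.write("[")
        for i, rec in enumerate(records):
            f.write(json.dumps(rec) if i == 0 else ",\n" + json.dumps(rec))
        f.write("]")
    with open("sample_output.json") as f:
        print(json.load(f))  # [{'title': 't1'}, {'title': 't2'}]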
# -*- coding: utf-8 -*-
# Scrapy settings for diarioCoLatino project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioCoLatino'
SPIDER_MODULES = ['diarioCoLatino.spiders']
NEWSPIDER_MODULE = 'diarioCoLatino.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioCoLatino.middlewares.DiariocolatinoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioCoLatino.middlewares.DiariocolatinoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioCoLatino.pipelines.JsonWriterPipeline': 300,
}
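# The JsonWriterPipeline enabled above reads a custom "filename" setting, so crawls are
# normally launched with it on the command line (see the spider docstring further down), e.g.:
#   scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23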
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from diarioCoLatino.items import NoticiasItem
"""
OUTLET:
Diario Co Latino, El Salvador
USAGE:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
AUTH_RE = re.compile(r'\nPor.+?\n')
TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
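# --- Illustrative check (hypothetical byline text) of what the patterns above capture ---
# AUTH_RE grabs a leading "Por ..." author line and LOC_RE a "City/AGENCY" dateline,
# mirroring how parse_item() consumes them below; runs only when executed directly.
if __name__ == "__main__":
    sample = "\nPor Redaccion nacional\nSan Salvador/PL\nCuerpo de la nota..."
    author_line = AUTH_RE.match(sample).group(0)
    print(author_line.strip())                  # Por Redaccion nacional
    rest = "\n" + sample[len(author_line):]
    print(LOC_RE.match(rest).group(0).strip())  # San Salvador/PL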
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/') + 1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
def parse_page(self, response):
for link in response.xpath('//div[@class="post-listing"]/article/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
item['topic'] = None
for p in response.xpath('//div[@class="entry"]/p').extract():
text += remove_tags(p) + "\n"
if text == '':
for p in response.xpath('//div[@class="entry"]/div/span').extract():
text += remove_tags(p) + "\n"
text = "\n" + text
""" Obtiene autor """
res = AUTH_RE.match(text)
if res:
m = res.group(0)
item['author'] = m[m.find('Por')+len('Por'):].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina twitter """
res = TW_RE.search(text)
if res:
m = res.group(0)
item['twitter'] = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Obtiene lugar """
res = LOC_RE.match(text)
if res:
m = res.group(0)
if m[m.find('/') + 1:].strip().lower() != 'dpa':
item['location'] = m[:m.find('/')].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
else:
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina correo """
res = EM_RE.search(text)
if res:
m = res.group(0)
item['email'] = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
res = EM_RE.search(text)
if res:
m = res.group(0)
item['email'] = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
text = text.replace("\n@Diario Co Latino\n", '').strip()
text = "\n" + text
text = text.replace("\nDiario Co Latino\n", '').strip()
text = "\n" + text
text = text.replace("\nCo Latino\n", '').strip()
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioCoLatino.settings
[deploy]
#url = http://localhost:6800/
project = diarioCoLatino
[{"date": "2018-02-23T15:00:46-06:00", "topic": "Conservation", "title": "Island students learn about the MOU for cultural protection and preservation", "text": "A small delegation from the Public Affairs section of the United States Embassy in Belize and personnel from the Institute of Archeology, met with students from the San Pedro Junior College on Wednesday, February 21st, to host a presentation on the upcoming renewal of the Memorandum of Understanding (MOU) between both countries addressing cultural protection and preservation. Hosting the presentation was Dr. Andrew Cohen, Senior Cultural Property Analyst, from the Bureau of Educational and Cultural Affairs at the U.S Department of State. He discussed the key points of the MOU, which aims to continue combating the looting of Belize\u2019s archaeological sites and create awareness for the preservation of the country\u2019s historical heritage for future generations.\nThe group of officials was hosted by Founder/Chairman of the Board for the Marco Gonzalez Maya Site Janet Brown, which is located south of San Pedro Town. Prior to their presentation, the delegation had the opportunity to visit the site where a Maya trading post once thrived.\nLater that day, around 8:30PM a large group of students gathered at the Angel Nu\u00f1ez Auditorium eager to learn about the presentation. Brown welcomed everyone and thanked the students for taking some of their time to assist with the evening\u2019s program. Dr. Allan Moore from the Institute of Archeology addressed the students about Belize\u2019s archaeology and the different programs the institution is engaged in. Moore indicated that the issue of looting continues, but they are hopeful that with the renewal of the MOU and the partnership with the United States, such activity can be drastically reduced. The presentation also covered their efforts to deter looting by encouraging young Belizeans and educating them about the importance of keeping these objects in Belize. They believe that through programs in the tourism sector, people will realize how important it is to preserve these sites and artifacts for tourism, which in return creates jobs.\nDr. Cohen held an interactive session with the students, explaining to them the key points of the Memorandum on the illegal trafficking of artifacts. \u201cIn this binding treaty, the United States will restrict the importation of archaeological material from Belize. So what that means is anytime someone or a shipment is found coming from Belize, and they have archaeological material, a permit from the Belize Institute of Archeology is required. If there is no permit, the material is seized and return to Belize.\u201d Cohen stated that there is no penalty at the time other than the artifacts being confiscated.\nHe explained that the renewal of the MOU is done every five years, due to the fact that the intention is to solve a problem that is considered temporary. \u201cIdeally, we would like to get to a point when people are not digging their antiquities and when there will not be a market for looted material,\u201d said Cohen. According to Cohen, not many countries have asked for this type agreement. \u201cThere are fewer than 20 countries, so it is a special relationship,\u201d he said.\nCohen encouraged everyone to get to know what their past contains, in particular, to find some time to learn more about the colonial architecture in Belize City and the wealth of the Belizean history that makes the country so rich. 
He believes that as people learn about these things, they will be more interested in preserving it and pass that on to their children. Before departing the island on Thursday, Cohen also had the opportunity to address the students of SPRCS in another informational session on the importance of curbing the illegal trade of archaeological artifacts, and the need to preserve Belize\u2019s natural heritage.\nIn 2013, Belize signed the MOU for the first time in an effort to tackle the looting of artifacts and increase lawful access to cultural objects and awareness. The historic agreement was signed at that time, by U.S Ambassador His Excellency Vinai Thummalapally and current Belize\u2019s Minister of Tourism and Civil Aviation, Honorable Manuel Heredia Jr. At the signing ceremony in Belize City, it was indicated that the event had been a culmination of years of hard work dating back to 2008 when the drafting of the proposal began. The proposal was submitted in 2010, then reviewed and sent back for minor modifications in 2011, before it was signed two years later.\nThe new agreement will be signed on Friday, February 23rd for another five years of bilateral commitment with the United States to ensure the protection of Belize\u2019s natural heritage.\nFollow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", "url": "https://www.sanpedrosun.com/conservation/2018/02/23/island-students-learn-mou-cultural-protection-preservation/"},
{"date": "2018-02-23T09:00:05-06:00", "topic": "Sports", "title": "San Pedro Co-Ed Softball Tournament semifinals begin", "text": "The San Pedro Co-Ed Softball Tournament semifinals kicked-off this past weekend on Sunday, February 18th. The first round of semifinals saw two intense matches taking place at the Hon. Louis Sylvestre Sporting Complex in San Pedro Town.\nThe first match started at 9:30AM with Quality Reef taking on San Pedro High School (SPHS). At the end of the game, Quality Reef managed to come out victorious by tallying seven home points against SPHS with six home points. The next match saw RCGO taking an easy victory with 12 home points against Extreme with five home points.\nThe semifinals continue this upcoming weekend on Sunday, February 25th, starting at 9:30 with Extreme versus RCGO, followed by Quality going against SPHS at 11:30AM. Everyone is invited to come and support their favorite team during the semifinals playoffs. Food and drinks will be on sale during the games.\nFollow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", "url": "https://www.sanpedrosun.com/sports/2018/02/23/san-pedro-co-ed-softball-tournament-semifinals-begin/"},
{"date": "2018-02-23T13:00:07-06:00", "topic": "Education", "title": "Ocean Academy hosts annual Ubuntu open day and honors mentors", "text": "The Ocean Academy (OA), hosted their third annual \u2018Ubuntu\u2019 open day on Thursday, February 15th at the Palapa Gardens in Caye Caulker. Ubuntu is an ancient African word meaning \u2018humanity to others\u2019. It also means \u2018I am what I am because of who we all are\u2019. The Ocean Academy embraced the word as symbolizing that OA exists thanks to the continued support of the community. The event showcased multiple booths featuring the many activities the high school offers to their students, including arts, music, culinary arts, electrical and even karate. The activity also served to recognize several mentors, who with their guidance have helped the students to explore their interests and talents.\nThe all-day event was open to the general public who got an insight on the diversity of the school\u2019s curriculum. Primary school students also had the opportunity to visit the different booths and learn what each had to offer. Speaking with OA Principal Heidi Curry, she explains the event showcases the students\u2019 entrepreneurship and apprenticeship ideas; a reflection of that particular student\u2019s passion. \u201cWe let the students choose what booth they want to participate in, so they enjoy and share it with everyone,\u201d said Curry. \u201cOur view of education is to be mindful, positive, learn to be passionate by experiencing what they like and then figure out how to create a legacy with that passion.\u201d\nCurry stated that their partnership with mentors from the local business community has significantly helped the students to explore their passion. \u201cMentors come from different fields, including the traffic department, doctors, scientists, tourism or cosmetology. This activity helps to create self-awareness in the students,\u201d she said. \u201cFor example, if a student likes math, then maybe they can have a mentor who is an engineer or a pilot and see all the possibilities they can achieve by using math or any subject of their choice.\u201d At the event, many students showcased their devotion to cosmetology, electricity, music, martial arts among other subjects. Curry added that the school is where it is at this moment thanks to the partnership with the community via the mentorship program.\nLater in the day in a short ceremony, several mentors were honored with a certificate of appreciation presented to them by the student they have been mentoring. The mentors were thanked and commended for giving some of their valuable time in helping and inspiring the students. It was also mentioned that many students stay in school because of the mentorship program at OA.\nCaye Caulker Chairwoman, Enelda Rosado, was present during the ceremony and commended the school for all the work they have been doing in with the community. \u201cI must applaud the Ocean Academy and the business community for the support they offer to the students,\u201d said Rosado. \u201cOur students are very talented and today they are here showcasing their passions and they could not do it without your support.\u201d After the ceremony, mentors and students mingled while others visited the different booths admiring the projects from the students.\nThis interactive community event started in 2016 after OA students won a high school innovation challenge competition in which they introduced the significant role of mentorship. 
The educational competition was held in Belize City where OA competed against 17 other high schools from Belize. They swayed the judges with their creative solutions to enhance the education system and took home the first place. The annual event was then created to showcase what the mentorship system can do and to honor those who give some of their time to guide and support students.\nFollow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", "url": "https://www.sanpedrosun.com/education/2018/02/23/ocean-academy-hosts-annual-ubuntu-open-day-honors-mentors/"}]
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class SanpedrosunSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Retrieve the value passed on the command line with "-s filename=<output file>"
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Copy only the fields the spider actually set, preserving this column order in the output.
for field in ("date", "topic", "title", "author", "location", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for sanPedroSun project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'sanPedroSun'
SPIDER_MODULES = ['sanPedroSun.spiders']
NEWSPIDER_MODULE = 'sanPedroSun.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sanPedroSun (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'sanPedroSun.middlewares.SanpedrosunSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'sanPedroSun.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'sanPedroSun.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from sanPedroSun.items import NoticiasItem
"""
OUTLET:
The San Pedro Sun, Belize
USAGE:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
class importantData(scrapy.Item):
page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
globalSet = set()
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "https://www.sanpedrosun.com/wp-content/themes/sunbase/GetResults.php?year=" + year + "&monthnum=" + month + "&day=" + day + "&posts_per_page=8&n_view=standard&n_style=list"
# self.baseURL = "https://www.sanpedrosun.com/wp-content/themes/sunbase/GetResults.php?year=" + year + "&monthnum=" + month + "&posts_per_page=8&n_view=standard&n_style=list"
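# With the sample arguments from the docstring (year=2018, month=2, day=23) this builds:
# https://www.sanpedrosun.com/wp-content/themes/sunbase/GetResults.php?year=2018&monthnum=2&day=23&posts_per_page=8&n_view=standard&n_style=list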
searchData = importantData()
searchData['page'] = 0
request = scrapy.Request(url=self.baseURL, callback=self.parse)
request.meta['item'] = searchData
yield request
def parse(self, response):
localSet = set(response.css('div.entry').xpath('./h2/a/@href').extract())
resultSet = localSet - self.globalSet
if len(resultSet) > 0:
searchData = response.meta['item']
for link in resultSet:
self.globalSet.add(link)
yield scrapy.Request(url=link, callback=self.parse_item)
searchData['page'] += 1
page = searchData['page']
request = scrapy.Request(url=response.url + "&n_more=" + str(page), callback=self.parse)
request.meta['item'] = searchData
yield request
def parse_page(self, response):
for link in response.css('div.td-ss-main-content').css('div.item-details').xpath('./h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.xpath('//div[@class="post"]/h1').extract_first()).strip()
try:
topic = response.xpath('//div[@class="breadcrumb"]/p/a/text()').extract()[-1]
except:
try:
topic = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
except:
topic = None
item['topic'] = topic
for p in response.xpath('//div[@class="entry"]').css('p').extract():
text += remove_tags(p) + "\n"
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
# location = G_RE.sub('', m).strip()
# if len(location) <= 35:
# item['location'] = location
# text = text[text.find(m)+len(m):]
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text)
# text = TAG3_RE.sub('', text)
item['text'] = text.strip()
item['url'] = response.url
yield item
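# --- Standalone sketch of the de-duplication idea used in QuotesSpider.parse above ---
# Each "&n_more=<n>" page can repeat links already returned, so only the set difference
# against everything seen so far is followed; the crawl ends once a page yields nothing
# new. Link values are hypothetical; runs only when executed directly.
if __name__ == "__main__":
    seen = set()

    def new_links(page_links):
        fresh = set(page_links) - seen
        seen.update(fresh)
        return fresh

    print(sorted(new_links(["/post-a", "/post-b"])))  # ['/post-a', '/post-b']
    print(sorted(new_links(["/post-b", "/post-c"])))  # ['/post-c'] -> an empty result would end the crawl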
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = sanPedroSun.settings
[deploy]
#url = http://localhost:6800/
project = sanPedroSun
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tiempoDigitalHn.settings
[deploy]
#url = http://localhost:6800/
project = tiempoDigitalHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TiempodigitalhnSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Retrieve the value passed on the command line with "-s filename=<output file>"
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Copy only the fields the spider actually set, preserving this column order in the output.
for field in ("date", "topic", "title", "author", "location", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for tiempoDigitalHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tiempoDigitalHn'
SPIDER_MODULES = ['tiempoDigitalHn.spiders']
NEWSPIDER_MODULE = 'tiempoDigitalHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tiempoDigitalHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tiempoDigitalHn.middlewares.TiempodigitalhnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tiempoDigitalHn.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tiempoDigitalHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from tiempoDigitalHn.items import NoticiasItem
"""
OUTLET:
Tiempo Digital, Honduras
USAGE:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
LOC_RE1 = re.compile(r'\n([A-Z]+ )+ ?[.-]')
LOC_RE2 = re.compile(r'\n.+?,? ?.+? ?(\. ?-|\.|-) ?[A-Z]')
SOURCE_RE = re.compile(r'\n ?Fuente:.+$')
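# --- Illustrative check (hypothetical dateline) of the location patterns above ---
# LOC_RE2 captures a leading "City, Country.-" style dateline; parse_item() below drops the
# trailing capital letter the pattern consumes and strips the punctuation. Runs only when
# executed directly.
if __name__ == "__main__":
    sample = "\nTegucigalpa, Honduras.- El Congreso aprobo la reforma."
    m = LOC_RE2.match(sample).group(0)[:-1]
    print(m.replace("-", "").replace(".", "").strip())  # Tegucigalpa, Honduras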
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "https://tiempo.hn/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.page-nav').xpath('./a/@href').extract()
if len(pagination) > 0:
try:
pagination = pagination[-2]
except:
pagination = pagination[-1]
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/') + 1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.td-ss-main-content').css('div.td_module_1').xpath('./h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//span[@class="td-post-date"]/time/@datetime').extract_first()
item['title'] = remove_tags(response.xpath('//header[@class="td-post-title"]/h1').extract_first()).strip()
try:
topic = response.xpath('//ul[@class="td-category"]/li').extract()[-1]
item['topic'] = remove_tags(topic)
except:
item['topic'] = None
author = response.xpath('//div[@class="td-post-author-name"]/a').extract_first()
if author is not None:
item['author'] = remove_tags(author)
for p in response.xpath('//div[@class="td-post-content"]').css('p').extract():
text += remove_tags(p) + "\n"
text = text.strip()
text = "\n" + text
text = text.replace(u'\u2013', "-")
text = text.replace(u'\u00a0', '')  # Remove non-breaking spaces
res = LOC_RE1.match(text)
if res:
m = res.group(0)[:-1]
location = m.replace("-", '').strip()
if len(location) <= 25:
item['location'] = location
text = text.replace(m, '').strip()
text = "\n" + text
res = LOC_RE2.match(text)
if res:
m = res.group(0)[:-1]
location = m.replace("-", '').replace(".", '').strip()
if len(location) <= 25:
item['location'] = location
text = text.replace(m, '').strip()
text = "\n" + text
res = SOURCE_RE.search(text)
if res:
m = res.group(0)
text = text.replace(m, '').strip()
text = "\n" + text
item['text'] = text.strip()
item['url'] = response.url
yield item