Commit b3970bbd authored by Mario Chirinos

reorganization

parent 067f09e9
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LajornadaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class LajornadaDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
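Neither class is referenced elsewhere in this commit; if the middlewares are meant to be active, they need entries in the project's settings.py. A minimal sketch, assuming the module path laJornada.middlewares (543 is just an arbitrary ordering value, as in Scrapy's commented-out template):

SPIDER_MIDDLEWARES = {
    'laJornada.middlewares.LajornadaSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'laJornada.middlewares.LajornadaDownloaderMiddleware': 543,
}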
# -*- coding: utf-8 -*-
"""
MEDIA:
La Jornada, CDMX

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd laJornada/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re, json
from laJornada.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    return TAG_RE.sub('', text)


class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        # year, month and day are passed on the command line with -a.
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        self.this_date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
        self.baseURL = "https://www.jornada.com.mx/" + year + "/" + month.zfill(2) + "/" + self.day.zfill(2) + "/"

        section_list = ["politica", "mundo", "capital", "cultura", "deportes",
                        "economia", "sociedad", "estados", "espectaculos"]
        for s in section_list:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

    def parse(self, response):
        # Collect the article links on the section page and follow each one.
        link_set = set(response.css('div.section-cont').css('a.cabeza::attr(href)').extract())
        for link in link_set:
            yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        # Take the published date from the page metadata; if its day does not match
        # the requested date, replace the day of month with the requested one and
        # keep the rest of the timestamp.
        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        d, t = news_date.split("T")
        if d != self.this_date:
            news_date = d[:d.rfind('-')] + "-" + self.day.zfill(2) + "T" + t

        title = response.css('div.cabeza').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('img.title::attr(title)').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        for p in response.css('div.text').css('p').extract():
            p = p.replace("<br>", "\n")
            text += remove_tags(p) + "\n"

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
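The spider can also be driven from a plain Python script instead of the scrapy CLI. A minimal sketch using Scrapy's CrawlerProcess, assuming it runs inside the laJornada project (so the 'noticias' spider and project settings can be found) and that the filename setting is read by the JsonWriterPipeline shown further down; the date values are made up:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('filename', '2017-03-22.json')  # consumed by JsonWriterPipeline (see pipelines below)

process = CrawlerProcess(settings)
process.crawl('noticias', year="2017", month="3", day="22")
process.start()  # blocks until the crawl finishes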
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
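NoticiasItem behaves like a dictionary restricted to the fields declared above; assigning any other key raises KeyError, and fields that are never set are simply absent, which is why the pipeline below checks each field before writing it. A small illustrative sketch with made-up values:

item = NoticiasItem()
item['title'] = 'Example headline'           # made-up value
item['url'] = 'https://www.jornada.com.mx/'  # made-up value
# item['summary'] = '...'                    # would raise KeyError: undeclared field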
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')
        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Collect only the fields that were actually populated, in a fixed order.
        row = []
        for key in ("date", "topic", "title", "author", "location", "text", "url"):
            try:
                row.append((key, item[key]))
            except KeyError:
                pass
        line = OrderedDict(row)

        # Write the items as a JSON array: only the first item has no leading comma.
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        else:
            self.file.write(",\n" + json.dumps(line))
        return item
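As the header comment says, the pipeline only runs if it is registered in the project's settings.py; the filename setting it reads is the one passed with -s on the command line in the spider's USAGE docstring. A minimal sketch of the settings entry, assuming the module path laJornada.pipelines:

ITEM_PIPELINES = {
    'laJornada.pipelines.JsonWriterPipeline': 300,
}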
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
import datetime
#from myModule import myModule

#===============================================================================
def findLastDate(directory):
    # Find the most recent YYYY-MM-DD.json file inside the newest subdirectory.
    path = Path(directory)
    dirs = [e.name for e in path.iterdir() if e.is_dir()]
    dirs.sort()

    path = Path(directory) / dirs[-1]
    files = [e.name[:-5] for e in path.glob("*.json")]
    files.sort()

    date = datetime.datetime.strptime(files[-1], '%Y-%m-%d')
    return date

#===============================================================================
def updateDir(directory):
    # Walk day by day from the last stored date up to today.
    startDate = findLastDate(directory)
    endDate = datetime.datetime.now()
    delta = endDate - startDate
    for i in range(delta.days + 1):
        day = startDate + datetime.timedelta(days=i)
        print(day)

#===============================================================================
def main(argv):
    if len(sys.argv) != 2:
        print("Usage: " + argv[0] + " <directory>")
    else:
        updateDir(argv[1])

if __name__ == "__main__":
    main(sys.argv)
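As committed, updateDir() only prints the dates between the last stored file and today; the crawl for each missing day still has to be launched by hand. A hedged sketch of how the loop body could invoke the command from the spider's USAGE docstring instead of print(day), assuming the script runs from the directory that contains the laJornada/ project (crawlDay and outputDir are hypothetical names):

import subprocess

def crawlDay(day, outputDir):
    # Run the spider for one date and store the result as YYYY-MM-DD.json.
    filename = day.strftime('%Y-%m-%d') + '.json'
    subprocess.run(['scrapy', 'crawl', 'noticias', '--nolog',
                    '-s', 'filename=' + outputDir + '/' + filename,
                    '-a', 'year=' + str(day.year),
                    '-a', 'month=' + str(day.month),
                    '-a', 'day=' + str(day.day)],
                   cwd='laJornada', check=True)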