m3 / crawlersNoticias / Commits / cbeb18b8

Commit cbeb18b8 authored Dec 18, 2018 by Renán Sosa Guillen

crawlers

parent 24040d75
Showing 16 changed files with 882 additions and 383 deletions
crawler_script/tracker_proceso.py                                +79    -0
descarga_por_mes/proceso/proceso/middlewares.py                  +37    -85
descarga_por_mes/proceso/proceso/settings.py                     +12    -27
descarga_por_mes/proceso/proceso/spiders/noticias.py             +119   -271
descarga_por_mes/proceso_org/proceso/__init__.py                 +0     -0
descarga_por_mes/proceso_org/proceso/items.py                    +20    -0
descarga_por_mes/proceso_org/proceso/middlewares.py              +104   -0
descarga_por_mes/proceso_org/proceso/pipelines.py                +75    -0
descarga_por_mes/proceso_org/proceso/settings.py                 +105   -0
descarga_por_mes/proceso_org/proceso/settings_org.py             +0     -0
descarga_por_mes/proceso_org/proceso/spiders/__init__.py         +4     -0
descarga_por_mes/proceso_org/proceso/spiders/noticias.py         +307   -0
descarga_por_mes/proceso_org/proceso/spiders/noticias_org.py     +0     -0
descarga_por_mes/proceso_org/scrapy.cfg                          +11    -0
parse_date_files.py                                              +5     -0
parse_date_files2.py                                             +4     -0
crawler_script/tracker_proceso.py  0 → 100644
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Tracking script for the PROCESO news outlet.
"""
import sys
import os

baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
s = {"crawler": "descarga_por_mes/proceso"}
media = s['crawler'][s['crawler'].rfind("/") + 1:]

os.chdir(baseDir)
try:
    os.makedirs(media)
except:
    print "ok"
os.chdir(media)

# range goes from 1976 to 2018
for year in xrange(1976, 2019):
    try:
        os.makedirs(str(year))
    except:
        print "ok"
    os.chdir(str(year))
    filename = str(year) + ".json"
    scrapycommand = "scrapy crawl noticias --nolog -s filename={0} -a year={1}".format(filename, str(year))
    mydir = os.getcwd()
    print mydir
    os.chdir(scrapyDir + s['crawler'])
    print media
    print scrapycommand
    os.system(scrapycommand)
    fileSize = os.stat(filename).st_size
    if fileSize <= 3:
        os.system("rm " + filename)
    else:
        os.chdir(scrapyDir)
        sys_command = "python3 parse_date_files.py {0} {1}".format(s['crawler'], filename)
        os.system(sys_command)
        os.chdir(media)
        mediaYears = os.listdir(".")
        mediaYears.sort()
        for yy in mediaYears:
            os.chdir(yy)
            try:
                os.makedirs(baseDir + media + "/" + yy)
            except:
                pass
            mediaDays = os.listdir(".")
            mediaDays = [l for l in mediaDays if not l.startswith('.')]
            mediaDays.sort()
            for dd in mediaDays:
                os.system("mv " + dd + " " + baseDir + media + "/" + yy)
            os.chdir("..")
            os.system("rm -R " + yy)
        os.chdir("..")
        os.system("rm -R " + media)
        os.chdir(s['crawler'])
        os.system("rm " + filename)
    os.chdir(mydir)
    os.chdir("..")
# os.chdir("..")
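The tracker launches one crawl per year and discards runs that produced no articles (an output holding only the brackets written by the JSON pipeline is at most a few bytes, hence the fileSize <= 3 check). As a rough, hypothetical illustration of that crawl-and-prune step only (the mv/rm directory shuffling is omitted), the same loop could be written in Python 3 with subprocess and pathlib; the paths, command, and year range are copied from the script above, everything else is an assumption, not part of this commit:

import subprocess
from pathlib import Path

# paths taken from tracker_proceso.py above; "proceso" is the media name
base_dir = Path("/home/geoint/virtualHDD/m3/noticias/proceso")
crawler_dir = Path("/home/geoint/crawlersNoticias/descarga_por_mes/proceso")

for year in range(1976, 2019):
    (base_dir / str(year)).mkdir(parents=True, exist_ok=True)
    filename = "{0}.json".format(year)
    # same command the tracker builds with str.format()
    subprocess.run(
        ["scrapy", "crawl", "noticias", "--nolog",
         "-s", "filename=" + filename, "-a", "year=" + str(year)],
        cwd=str(crawler_dir), check=False)
    out_file = crawler_dir / filename
    # an empty crawl writes only "[]", so anything <= 3 bytes is discarded
    if out_file.exists() and out_file.stat().st_size <= 3:
        out_file.unlink()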
descarga_por_mes/proceso/proceso/middlewares.py
...
...
@@ -5,100 +5,52 @@
 # See documentation in:
 # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
-import os, tempfile, time, sys, logging, dryscrape
-from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
+from scrapy import signals
-logger = logging.getLogger(__name__)
+class ProcesoSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
-class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
-    def __init__(self, settings):
-        super(ThreatDefenceRedirectMiddleware, self).__init__(settings)
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
-        # start xvfb to support headless scraping
-        if 'linux' in sys.platform:
-            dryscrape.start_xvfb()
+    def process_spider_input(response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
-        self.dryscrape_session = dryscrape.Session(base_url='https://hemeroteca.proceso.com.mx/')
-        for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
-            # seems to be a bug with how webkit-server handles accept-encoding
-            if key.lower() != 'accept-encoding':
-                self.dryscrape_session.set_header(key, value)
+        # Should return None or raise an exception.
+        return None
-    def _redirect(self, redirected, request, spider, reason):
-        # act normally if this isn't a threat defense redirect
-        if not self.is_threat_defense_url(redirected.url):
-            return super(ThreatDefenceRedirectMiddleware, self)._redirect(redirected, request, spider, reason)
+    def process_spider_output(response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
-        logger.debug('Proceso threat defense triggered for {0}'.format(request.url))
-        request.cookies = self.bypass_threat_defense(redirected.url)
-        request.dont_filter = True  # prevents the original link being marked a dupe
-        return request
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
-    def is_threat_defense_url(self, url):
-        return 'proceso.com.mx' in url
+    def process_spider_exception(response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
-    def bypass_threat_defense(self, url=None):
-        # only navigate if any explicit url is provided
-        if url:
-            self.dryscrape_session.visit(url)
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
-        # solve the captcha if there is one
-        # captcha_images = self.dryscrape_session.css('img[src *= captcha]')
-        # if len(captcha_images) > 0:
-        #     return self.solve_captcha(captcha_images[0])
+    def process_start_requests(start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
-        # click on any explicit retry links
-        # retry_links = self.dryscrape_session.css('a[href *= threat_defence]')
-        # if len(retry_links) > 0:
-        #     return self.bypass_threat_defense(retry_links[0].get_attr('href'))
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
-        # otherwise, we're on a redirect page so wait for the redirect and try again
-        self.wait_for_redirect()
-        return self.bypass_threat_defense()
-    def wait_for_redirect(self, url=None, wait=0.1, timeout=10):
-        url = url or self.dryscrape_session.url()
-        for i in range(int(timeout // wait)):
-            time.sleep(wait)
-            if self.dryscrape_session.url() != url:
-                return self.dryscrape_session.url()
-        logger.error("Maybe {0} isn't a redirect URL?".format(self.dryscrape_session.url()))
-        raise Exception('Timed out on the zipru redirect page.')
-    # def solve_captcha(self, img, width=1280, height=800):
-    #     # take a screenshot of the page
-    #     self.dryscrape_session.set_viewport_size(width, height)
-    #     filename = tempfile.mktemp('.png')
-    #     self.dryscrape_session.render(filename, width, height)
-    #     # inject javascript to find the bounds of the captcha
-    #     js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
-    #     rect = self.dryscrape_session.eval_script(js)
-    #     box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))
-    #     # solve the captcha in the screenshot
-    #     image = Image.open(filename)
-    #     os.unlink(filename)
-    #     captcha_image = image.crop(box)
-    #     captcha = pytesseract.image_to_string(captcha_image)
-    #     logger.debug(f'Solved the Zipru captcha: "{captcha}"')
-    #     # submit the captcha
-    #     input = self.dryscrape_session.xpath('//input[@id = "solve_string"]')[0]
-    #     input.set(captcha)
-    #     button = self.dryscrape_session.xpath('//button[@id = "button_submit"]')[0]
-    #     url = self.dryscrape_session.url()
-    #     button.click()
-    #     # try again if it we redirect to a threat defense URL
-    #     if self.is_threat_defense_url(self.wait_for_redirect(url)):
-    #         return self.bypass_threat_defense()
-    #     # otherwise return the cookies as a dict
-    #     cookies = {}
-    #     for cookie_string in self.dryscrape_session.cookies():
-    #         if 'domain=zipru.to' in cookie_string:
-    #             key, value = cookie_string.split(';')[0].split('=')
-    #             cookies[key] = value
-    #     return cookies
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
descarga_por_mes/proceso/proceso/settings.py
...
...
@@ -16,7 +16,7 @@ NEWSPIDER_MODULE = 'proceso.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
+#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
 # Obey robots.txt rules
 # ROBOTSTXT_OBEY = True
...
...
@@ -33,38 +33,28 @@ DOWNLOAD_DELAY = 1
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-COOKIES_ENABLED = True
-COOKIES_DEBUG = True
-SPLASH_COOKIES_DEBUG = True
+COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
 # Override the default request headers:
-# DEFAULT_REQUEST_HEADERS = {
-#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#     'User-Agent': USER_AGENT,
-#     'Connection': 'Keep-Alive',
-#     # 'Accept-Encoding': 'gzip, deflate',
-#     'Accept-Language': 'en',
-# }
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-SPIDER_MIDDLEWARES = {
-    # 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
-    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
-}
+# SPIDER_MIDDLEWARES = {
+#     'proceso.middlewares.ProcesoSpiderMiddleware': 543,
+# }
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-DOWNLOADER_MIDDLEWARES = {
-    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
-    'scrapy_splash.SplashCookiesMiddleware': 723,
-    'scrapy_splash.SplashMiddleware': 725,
-    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
-    # 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
-}
+# DOWNLOADER_MIDDLEWARES = {
+#     'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
+# }
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
...
...
@@ -98,8 +88,3 @@ ITEM_PIPELINES = {
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
-SPLASH_URL = 'http://localhost:8050/'
-DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
descarga_por_mes/proceso/proceso/spiders/noticias.py

This diff is collapsed.
descarga_por_mes/proceso_org/proceso/__init__.py  0 → 100644
descarga_por_mes/proceso_org/proceso/items.py  0 → 100644
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
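For context on how these fields are used further down: a spider callback fills the item like a dict and yields it to the pipeline. A minimal hypothetical example (the values are invented, not taken from the spider in this commit):

from proceso.items import NoticiasItem

def build_example_item():
    # purely illustrative values; real ones come from the spider's parsing
    item = NoticiasItem()
    item['title'] = 'Example headline'
    item['date'] = '2018-12-18'
    item['topic'] = 'Nacional'
    item['url'] = 'https://hemeroteca.proceso.com.mx/?page_id=123'
    # fields left unset are simply missing from the item, which is why the
    # pipeline below wraps every field access in try/except
    return item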
descarga_por_mes/proceso_org/proceso/middlewares.py  0 → 100644
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import os, tempfile, time, sys, logging, dryscrape
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware

logger = logging.getLogger(__name__)


class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
    def __init__(self, settings):
        super(ThreatDefenceRedirectMiddleware, self).__init__(settings)

        # start xvfb to support headless scraping
        if 'linux' in sys.platform:
            dryscrape.start_xvfb()

        self.dryscrape_session = dryscrape.Session(base_url='https://hemeroteca.proceso.com.mx/')
        for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
            # seems to be a bug with how webkit-server handles accept-encoding
            if key.lower() != 'accept-encoding':
                self.dryscrape_session.set_header(key, value)

    def _redirect(self, redirected, request, spider, reason):
        # act normally if this isn't a threat defense redirect
        if not self.is_threat_defense_url(redirected.url):
            return super(ThreatDefenceRedirectMiddleware, self)._redirect(redirected, request, spider, reason)

        logger.debug('Proceso threat defense triggered for {0}'.format(request.url))
        request.cookies = self.bypass_threat_defense(redirected.url)
        request.dont_filter = True  # prevents the original link being marked a dupe
        return request

    def is_threat_defense_url(self, url):
        return 'proceso.com.mx' in url

    def bypass_threat_defense(self, url=None):
        # only navigate if any explicit url is provided
        if url:
            self.dryscrape_session.visit(url)

        # solve the captcha if there is one
        # captcha_images = self.dryscrape_session.css('img[src *= captcha]')
        # if len(captcha_images) > 0:
        #     return self.solve_captcha(captcha_images[0])

        # click on any explicit retry links
        # retry_links = self.dryscrape_session.css('a[href *= threat_defence]')
        # if len(retry_links) > 0:
        #     return self.bypass_threat_defense(retry_links[0].get_attr('href'))

        # otherwise, we're on a redirect page so wait for the redirect and try again
        self.wait_for_redirect()
        return self.bypass_threat_defense()

    def wait_for_redirect(self, url=None, wait=0.1, timeout=10):
        url = url or self.dryscrape_session.url()
        for i in range(int(timeout // wait)):
            time.sleep(wait)
            if self.dryscrape_session.url() != url:
                return self.dryscrape_session.url()
        logger.error("Maybe {0} isn't a redirect URL?".format(self.dryscrape_session.url()))
        raise Exception('Timed out on the zipru redirect page.')

    # def solve_captcha(self, img, width=1280, height=800):
    #     # take a screenshot of the page
    #     self.dryscrape_session.set_viewport_size(width, height)
    #     filename = tempfile.mktemp('.png')
    #     self.dryscrape_session.render(filename, width, height)
    #     # inject javascript to find the bounds of the captcha
    #     js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
    #     rect = self.dryscrape_session.eval_script(js)
    #     box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))
    #     # solve the captcha in the screenshot
    #     image = Image.open(filename)
    #     os.unlink(filename)
    #     captcha_image = image.crop(box)
    #     captcha = pytesseract.image_to_string(captcha_image)
    #     logger.debug(f'Solved the Zipru captcha: "{captcha}"')
    #     # submit the captcha
    #     input = self.dryscrape_session.xpath('//input[@id = "solve_string"]')[0]
    #     input.set(captcha)
    #     button = self.dryscrape_session.xpath('//button[@id = "button_submit"]')[0]
    #     url = self.dryscrape_session.url()
    #     button.click()
    #     # try again if it we redirect to a threat defense URL
    #     if self.is_threat_defense_url(self.wait_for_redirect(url)):
    #         return self.bypass_threat_defense()
    #     # otherwise return the cookies as a dict
    #     cookies = {}
    #     for cookie_string in self.dryscrape_session.cookies():
    #         if 'domain=zipru.to' in cookie_string:
    #             key, value = cookie_string.split(';')[0].split('=')
    #             cookies[key] = value
    #     return cookies
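This middleware ships in the commit but is left disabled; the settings below only reference it in a commented-out entry. If it were enabled, the registration would presumably look like the following sketch, with the stock redirect middleware turned off so the custom class handles redirects (the 820 priority and both class paths are taken from the commented lines in settings.py):

# settings.py sketch -- not part of this commit's active configuration
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in redirect handling so the custom class takes over
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
}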
descarga_por_mes/proceso_org/proceso/pipelines.py  0 → 100644
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):
    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass
        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
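The filename the pipeline opens is not hard-coded: from_crawler reads it from the crawler settings, which is what the -s filename=YYYY.json flag in tracker_proceso.py supplies. Since the pipeline writes "[", then one json.dumps() per item separated by ",\n", then "]", the output is a single JSON array. A small hypothetical read-back check, assuming a crawl has already produced 2018.json:

import json

with open("2018.json") as f:   # name passed earlier via `-s filename=2018.json`
    articles = json.load(f)    # the pipeline's output is one well-formed JSON array

print(len(articles), "articles")
# possible keys per item: date, topic, title, author, location, text, url
print(articles[0].get("title"))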
descarga_por_mes/proceso_org/proceso/settings.py  0 → 100644
# -*- coding: utf-8 -*-

# Scrapy settings for proceso project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'proceso'

SPIDER_MODULES = ['proceso.spiders']
NEWSPIDER_MODULE = 'proceso.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'User-Agent': USER_AGENT,
#     'Connection': 'Keep-Alive',
#     # 'Accept-Encoding': 'gzip, deflate',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    # 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'proceso.pipelines.JsonWriterPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
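These settings route requests through a Splash instance at localhost:8050 via scrapy-splash. The spider that actually uses them is collapsed in this diff, so as a generic reference only, a spider running under this configuration would typically issue requests roughly like this (spider name, URL, and wait time are placeholders, not taken from this commit):

import scrapy
from scrapy_splash import SplashRequest

class ExampleSplashSpider(scrapy.Spider):
    name = "noticias_splash_example"   # hypothetical, not the spider in this commit

    def start_requests(self):
        # render the page in the Splash service configured by SPLASH_URL above
        yield SplashRequest(
            "https://hemeroteca.proceso.com.mx/",
            callback=self.parse,
            args={"wait": 1.0},        # give the page time to render
        )

    def parse(self, response):
        self.logger.info("rendered page length: %d", len(response.body))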
descarga_por_mes/proceso/proceso/settings_org.py → descarga_por_mes/proceso_org/proceso/settings_org.py

File moved
descarga_por_mes/proceso_org/proceso/spiders/__init__.py  0 → 100644
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_mes/proceso_org/proceso/spiders/noticias.py  0 → 100644

This diff is collapsed.
descarga_por_mes/proceso/proceso/spiders/noticias_org.py → descarga_por_mes/proceso_org/proceso/spiders/noticias_org.py

File moved
descarga_por_mes/proceso_org/scrapy.cfg  0 → 100644
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = proceso.settings
[deploy]
#url = http://localhost:6800/
project = proceso
parse_date_files.py
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Takes as input a JSON file containing news items with different dates.
Returns the news items in folders separated by year.
Usage:
    python parse_date_files.py <ruta_del_crawler> <nombre_archivo>
...
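The body of parse_date_files.py is truncated in this diff, so the following is not the author's implementation, only a sketch of the grouping the docstring describes (news items from one JSON file bucketed by the year in their 'date' field, assuming an ISO-like date string):

import json
from collections import defaultdict

def group_items_by_year(json_path):
    # illustrative only; the real script also writes the groups out to per-year folders
    with open(json_path) as f:
        items = json.load(f)
    by_year = defaultdict(list)
    for item in items:
        year = item['date'][:4]   # assumes dates like '2018-12-18...'
        by_year[year].append(item)
    return by_year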
parse_date_files2.py
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Date parsing for news downloaded by the 'descarga_hacia_atras' type crawlers.
Usage:
    python parse_date_files.py <nombre_del_crawler>
...