Commit 95002f31 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 168936ef
[{"nombre": "El Financiero", "crawler": "descarga_por_rss/elFinanciero", "url": "http://www.elfinanciero.com.mx/"},
{"nombre": "El Universal", "crawler": "descarga_por_rss/elUniversal", "url": "http://www.eluniversal.com.mx/"},
{"nombre": "El Sol de Mexico", "crawler": "descarga_por_rss/solDeMex", "url": "https://www.elsoldemexico.com.mx"}]
\ No newline at end of file
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Batch driver (Python 2) that walks a JSON list of news sites, runs the
# matching scrapy crawler once per pending calendar day, and merges each
# day's fresh download into the per-site/per-year JSON archive under baseDir.
#
# Usage: python <this_script> sites.json
#   where sites.json is a list of objects with "nombre", "crawler" and "url".
#
# NOTE(review): indentation was reconstructed from the surrounding logic;
# verify nesting against the original file before relying on this layout.
import sys
import json
import os
import datetime
from collections import OrderedDict

# Crawling reference date: every site starts from "today".
today = datetime.datetime.now()
# Root of the per-site news archive (one folder per media, then per year).
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# Root of the checked-out scrapy projects, one sub-folder per crawler.
scrapyDir = "/home/geoint/crawlersNoticias/"
row = {}
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
    # desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
    desde = today  # start date; the per-site 'desde' field is currently ignored
    print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
    # Media folder name = crawler path without the "descarga_por_rss/" prefix.
    media = s['crawler'][s['crawler'].find("/") + 1:]
    try:
        os.makedirs(media)
    except:
        # Directory already exists (or could not be created) -- keep going.
        print "ok"
    os.chdir(media)
    # Locate the most recent year folder, deleting any stray *.json files
    # left at the year level until a real year directory is found.
    CORRECT_YEAR = False
    while not CORRECT_YEAR:
        lstYears = os.listdir(".")
        lstYears.sort()
        year = desde.year
        if len(lstYears) > 0:
            element = lstYears[len(lstYears) - 1]
            if element[-4:] == 'json':
                # Leftover file from a previous run; remove it and re-scan.
                os.system('rm ' + element)
            else:
                CORRECT_YEAR = True
                year = int(element)
        # NOTE(review): if the media directory is empty, CORRECT_YEAR is never
        # set and this loop appears to spin forever -- confirm intended input.
    for y in range(year, today.year + 1):
        print y
        try:
            os.makedirs(str(y))
        except:
            print "ok"
        os.chdir(str(y))
        # print os.getcwd()
        lstDays = os.listdir(".")
        lstDays = [l for l in lstDays if not l.startswith('.')]
        lstDays.sort()
        print lstDays
        # Default: resume from the day-of-year of the start date.
        day = desde.timetuple().tm_yday
        print day
        currentDate = desde
        if len(lstDays) > 0:
            # Resume from the newest YYYY-MM-DD.json already on disk.
            strDate = lstDays[len(lstDays) - 1]
            strDate = strDate[:strDate.find(".")]
            currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
            day = currentDate.timetuple().tm_yday
        elif y != desde.year:
            # Fresh folder for a non-start year: begin at January 1st.
            currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
            day = 1
        # Iterate day-of-year: up to Dec 31 for past years, up to today for
        # the current year.
        for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
            YESTERDAY = False
            filename = currentDate.strftime('%Y-%m-%d') + ".json"
            scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename
            mydir = os.getcwd()
            print mydir
            # Run the crawler from inside its scrapy project folder.
            os.chdir(scrapyDir + s['crawler'])
            print media
            print scrapycommand
            os.system(scrapycommand)
            fileSize = os.stat(filename).st_size
            if fileSize <= 3:
                # Output is "[]" or empty: nothing scraped for this date.
                os.system('rm ' + filename)
            else:
                f1 = mydir + '/' + filename  # archived copy (master)
                f2 = filename  # fresh download in the crawler dir (slave)
                f3 = baseDir + media + '/' + filename  # merged output
                try:
                    with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
                        master = json.load(infile1)
                        slave = json.load(infile2)
                        # URLs already archived, used to skip duplicates.
                        urlSet = set([line['url'] for line in master])
                        counter = 0
                        infile3.write('[')
                        # Pass 1: copy archived articles dated on currentDate.
                        for line in master:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
                            if lineDate == currentDate:
                                counter += 1
                                if media == 'elFinanciero' or media == 'solDeMex':
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                elif media == 'elUniversal':
                                    # elUniversal articles also carry a location field.
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('location', line['location']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                if counter == 1:
                                    infile3.write(json.dumps(row))
                                elif counter > 1:
                                    infile3.write(',\n' + json.dumps(row))
                        # Pass 2: append freshly scraped articles not yet archived.
                        for line in slave:
                            if not line['url'] in urlSet:
                                lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
                                if lineDate == currentDate:
                                    if media == 'elFinanciero' or media == 'solDeMex':
                                        row = OrderedDict([
                                            ('date', line['date']),
                                            ('topic', line['topic']),
                                            ('title', line['title']),
                                            ('author', line['author']),
                                            ('url', line['url']),
                                            ('text', line['text'])
                                        ])
                                    elif media == 'elUniversal':
                                        row = OrderedDict([
                                            ('date', line['date']),
                                            ('topic', line['topic']),
                                            ('title', line['title']),
                                            ('author', line['author']),
                                            ('location', line['location']),
                                            ('url', line['url']),
                                            ('text', line['text'])
                                        ])
                                    infile3.write(',\n' + json.dumps(row))
                                elif (currentDate - lineDate).days == 1:
                                    # The crawl also returned yesterday's news;
                                    # trigger the merge-back pass below.
                                    YESTERDAY = True
                        infile3.write(']')
                    os.system("mv " + f3 + " " + mydir)
                    # os.system("rm " + f2)
                except:
                    # No archived file yet (or merge failed): keep the raw download.
                    os.system("cp " + f2 + " " + mydir)
                if YESTERDAY:
                    # Re-merge yesterday's archive with the fresh download,
                    # since the crawl returned articles dated one day back.
                    currentDate -= datetime.timedelta(days=1)
                    filenameYesterday = currentDate.strftime('%Y-%m-%d') + ".json"
                    f1 = mydir + '/' + filenameYesterday
                    f2 = filename
                    f3 = baseDir + media + '/' + filenameYesterday
                    with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
                        master = json.load(infile1)
                        slave = json.load(infile2)
                        urlSet = set([line['url'] for line in master])
                        counter = 0
                        infile3.write('[')
                        # Copy every archived article for yesterday (no date filter here).
                        for line in master:
                            counter += 1
                            if media == 'elFinanciero' or media == 'solDeMex':
                                row = OrderedDict([
                                    ('date', line['date']),
                                    ('topic', line['topic']),
                                    ('title', line['title']),
                                    ('author', line['author']),
                                    ('url', line['url']),
                                    ('text', line['text'])
                                ])
                            elif media == 'elUniversal':
                                row = OrderedDict([
                                    ('date', line['date']),
                                    ('topic', line['topic']),
                                    ('title', line['title']),
                                    ('author', line['author']),
                                    ('location', line['location']),
                                    ('url', line['url']),
                                    ('text', line['text'])
                                ])
                            if counter == 1:
                                infile3.write(json.dumps(row))
                            elif counter > 1:
                                infile3.write(',\n' + json.dumps(row))
                        # Append new articles from the download dated yesterday.
                        for line in slave:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
                            if not line['url'] in urlSet and lineDate == currentDate:
                                if media == 'elFinanciero' or media == 'solDeMex':
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                elif media == 'elUniversal':
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('location', line['location']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                infile3.write(',\n' + json.dumps(row))
                        infile3.write(']')
                    os.system("mv " + f3 + " " + mydir)
                os.system("rm " + f2)
            os.chdir(mydir)
            # Advance the date; +2 compensates the -1 taken by the merge-back.
            if YESTERDAY:
                currentDate += datetime.timedelta(days=2)
            else:
                currentDate += datetime.timedelta(days=1)
        os.chdir("..")
    os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
\ No newline at end of file
...@@ -18,22 +18,25 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders' ...@@ -18,22 +18,25 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan (+http://www.yourdomain.com)' #USER_AGENT = 'diarioYucatan (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32 #CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
# DOWNLOAD_DELAY=3 #DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP=16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
COOKIES_ENABLED=False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False #TELNETCONSOLE_ENABLED = False
# Override the default request headers: # Override the default request headers:
#DEFAULT_REQUEST_HEADERS = { #DEFAULT_REQUEST_HEADERS = {
...@@ -44,7 +47,7 @@ COOKIES_ENABLED=False ...@@ -44,7 +47,7 @@ COOKIES_ENABLED=False
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = { #SPIDER_MIDDLEWARES = {
# 'diarioYucatan.middlewares.MyCustomSpiderMiddleware': 543, # 'diarioYucatan.middlewares.DiarioyucatanSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
...@@ -56,30 +59,32 @@ COOKIES_ENABLED=False ...@@ -56,30 +59,32 @@ COOKIES_ENABLED=False
# Enable or disable extensions # Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = { #EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { #ITEM_PIPELINES = {
# 'diarioYucatan.pipelines.SomePipeline': 300, # 'diarioYucatan.pipelines.DiarioyucatanPipeline': 300,
#} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay #AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_ENABLED=True
# The initial download delay # The initial download delay
#AUTOTHROTTLE_START_DELAY=5 #AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies # The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60 #AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received: # Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False #AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True #HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS=0 #HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR='httpcache' #HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[] #HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import scrapy, re import scrapy, re
from datetime import datetime, date, timedelta
from scrapy.spidermiddlewares.httperror import HttpError # from datetime import datetime, date, timedelta
# from scrapy.spidermiddlewares.httperror import HttpError
""" """
Esta version descarga ingresando una fecha. Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
de una fecha especifica.
USO: USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=09 -a day=13 scrapy crawl noticias -t json --nolog -o noticias.json
No es recomendable para fechas de mas de un mes de antiguas.
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
...@@ -30,81 +32,58 @@ class QuotesSpider(scrapy.Spider): ...@@ -30,81 +32,58 @@ class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes', section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes', 'mexico/quintana-roo',
'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud'] 'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud', 'editorial', 'multimedia']
# section_list = ['yucatan', 'salud', 'editorial', 'imagen',
year = getattr(self, 'year', None) # 'merida', 'merida/clima', 'merida/gobierno', 'merida/policia', 'merida/politica',
month = getattr(self, 'month', None) # 'mexico', 'mexico/quintana-roo', 'mexico/cdmx', 'mexico/economia', 'mexico/campeche',
day = getattr(self, 'day', None) # 'internacional', 'internacional/asia', 'internacional/europa', 'internacional/africa',
self.baseURL='http://yucatan.com.mx/seccion/' # 'internacional/america', 'internacional/oceania',
self.date = date(int(year), int(month), int(day)) # 'deportes', 'deportes/futbol', 'deportes/nfl',
self.parsing_month = { 'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7, # 'espectaculos', 'espectaculos/cine', 'espectaculos/farandula', 'espectaculos/musica',
'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12 } # 'espectaculos/tv-y-series',
# 'tecnologia', 'tecnologia/redes-sociales', 'tecnologia/innovaciones',
self.pages = 100 # 'multimedia', 'multimedia/fotos', 'multimedia/videos']
for s in section_list:
yield scrapy.Request(url=self.baseURL+s, callback=self.parse) self.globalLinkSet = set()
self.baseURL = 'http://www.yucatan.com.mx/seccion/'
self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
for s in section_list:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response): def parse(self, response):
if ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) > 0 ): pathList = ['//*[@class="g1-column"]/div/div/article',
for link in response.xpath('//a[@class="show-more-link"]/@href').extract(): '//*[@class="g1-collection g1-collection-columns-2"]/div/ul/li/article']
yield scrapy.Request(url=link, callback=self.parse_pagination)
elif ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)
for path in pathList:
for entry in response.xpath(path):
link = entry.css('h3').css('a::attr(href)').extract_first()
def parse_pagination(self, response): if not link in self.globalLinkSet:
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract() self.globalLinkSet.add(link)
if ( len(pagination) > 0 ): item = NoticiasItem()
p = 1
while p <= self.pages:
if ( p == 1 ):
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
elif ( p > 1 ):
yield scrapy.Request(url=response.url+'/page/'+str(p+1), callback=self.parse_link)
p += 1
else:
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
def parse_link(self, response):
for entry in response.xpath('//*[@class="bp-entry"]'):
entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]
news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:entry_date.find(' ')])) item['topic'] = remove_tags(response.xpath('//h2[@class="g1-delta g1-delta-2nd resaltartitulo"]').extract_first())
link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first() request = scrapy.Request(url=link, callback=self.parse_item)
if news_date == self.date and link is not None: request.meta['item'] = item
yield scrapy.Request(url=link, callback=self.parse_item) yield request
def parse_item(self, response): def parse_item(self, response):
item = response.meta['item']
text = '' text = ''
item = NoticiasItem()
item['title'] = response.css('h1.entry-title::text').extract_first()
d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first() item['date'] = response.xpath('//time[@class="entry-date"]/@datetime').extract_first() + "-06:00"
if d is None: item['title'] = remove_tags(response.xpath('//h1[@class="g1-mega g1-mega-1st entry-title"]').extract_first())
d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
if d is None:
d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico) for p in response.xpath('//*[@itemprop="articleBody"]/p').extract():
if d[-6:] != '-06:00': text += remove_tags(p) + "\n"
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
......
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Diarioyucatan2Item(scrapy.Item):
    """Scrapy item template for the diarioYucatan2 project.

    Empty placeholder generated by ``scrapy startproject``; the spider in
    this project declares its own ``NoticiasItem`` instead of using this.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class Diarioyucatan2SpiderMiddleware(object):
    """Default spider-middleware template for the diarioYucatan2 project.

    Scrapy invokes every hook below as an *instance* method, so each hook
    must take ``self`` as its first parameter.  The generated template was
    missing ``self`` on the four ``process_*`` hooks, which would have made
    the instance bind to the first real argument (e.g. the response) and
    shifted all the remaining arguments.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class Diarioyucatan2Pipeline(object):
    """Pass-through item pipeline (template default; not enabled in settings)."""

    def process_item(self, item, spider):
        # No transformation or filtering: hand the item straight back so it
        # continues through the (empty) pipeline chain.
        return item
# -*- coding: utf-8 -*-

# Scrapy settings for diarioYucatan2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'diarioYucatan2'

SPIDER_MODULES = ['diarioYucatan2.spiders']
NEWSPIDER_MODULE = 'diarioYucatan2.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan2 (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Cookies are off because the crawler only fetches public article pages.
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'diarioYucatan2.middlewares.Diarioyucatan2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'diarioYucatan2.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'diarioYucatan2.pipelines.Diarioyucatan2Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, re
# from datetime import datetime, date, timedelta
# from scrapy.spidermiddlewares.httperror import HttpError
"""
Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
de una fecha especifica.
USO:
scrapy crawl noticias -t json --nolog -o noticias.json
Genera un archivo JSON con todas las noticias disponibles. El archivo 'parse_date_file.py'
puede servir para clasificar dichas noticias en sus respectivas fechas.
"""
# Matches one HTML/XML tag, e.g. "<p>" or "</div>".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag removed, keeping inner text."""
    return re.sub(TAG_RE, '', text)
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    title = scrapy.Field()     # headline text
    text = scrapy.Field()      # article body with tags stripped
    date = scrapy.Field()      # publication date string
    location = scrapy.Field()  # dateline/place (not populated by this spider's parse_item)
    author = scrapy.Field()    # byline (not populated by this spider's parse_item)
    topic = scrapy.Field()     # section taken from the page breadcrumb
    url = scrapy.Field()       # source URL of the article
class QuotesSpider(scrapy.Spider):
    """Crawls yucatan.com.mx section listings and yields NoticiasItem objects.

    Flow: start_requests -> parse (find "show more" listing pages) ->
    parse_pagination (enumerate numbered pages) -> parse_link (collect
    article links) -> parse_item (extract one article).

    Fix: parse_pagination previously requested ``/page/'+str(p+1)`` inside a
    loop starting at p == 2, which skipped page 2 entirely and requested one
    page past the last -- it now requests ``/page/'+str(p)``.
    """
    name = "noticias"

    def start_requests(self):
        # Sections of the site to crawl; each one is a listing page.
        section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes',
                        'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud']
        self.baseURL = 'http://yucatan.com.mx/seccion/'
        # Spanish month name -> month number, used when parsing listing dates.
        self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
                              'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        for s in section_list:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

    def parse(self, response):
        """Follow every "show more" link, or paginate the section page itself."""
        more_links = response.xpath('//a[@class="show-more-link"]/@href').extract()
        if len(more_links) > 0:
            for link in more_links:
                yield scrapy.Request(url=link, callback=self.parse_pagination)
        else:
            # No sub-listings: paginate this very page.
            yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)

    def parse_pagination(self, response):
        """Request every numbered page of a listing (page 1 is response.url)."""
        pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
        if len(pagination) > 0:
            # Last pagination link ends in the total page count, e.g. ".../page/7".
            pagination = pagination[-1]
            pages = int(pagination[pagination.rfind('/') + 1:])
            for p in range(1, pages + 1):
                if p == 1:
                    yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
                else:
                    # Was str(p + 1): skipped page 2 and overshot the last page.
                    yield scrapy.Request(url=response.url + '/page/' + str(p), callback=self.parse_link)
        else:
            # Single-page listing.
            yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)

    def parse_link(self, response):
        """Yield a request per article entry found on a listing page."""
        for entry in response.xpath('//*[@class="bp-entry"]'):
            link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
            if link is not None:
                yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract one article page into a NoticiasItem."""
        text = ''
        item = NoticiasItem()
        item['title'] = response.css('h1.entry-title::text').extract_first()
        # Publication date: try the visible entry date, then metadata fallbacks.
        d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
        if d is None:
            d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
        if d is None:
            d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
        # NOTE(review): this assumes d always ends in a 6-char UTC offset; a
        # naive timestamp would lose its last 6 characters -- confirm inputs.
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        for paragraph in response.css('div.entry-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        # Second breadcrumb element is the section/topic.
        item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
        item['url'] = response.url
        # print item['title']
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = diarioYucatan2.settings
[deploy]
#url = http://localhost:6800/
project = diarioYucatan2
...@@ -639,7 +639,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -639,7 +639,8 @@ class QuotesSpider(scrapy.Spider):
# item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T') item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T')
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first()) title = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['title'] = " ".join(title.split())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first() item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
author = response.xpath('//*[@class="credito-autor"]/text()').extract_first() author = response.xpath('//*[@class="credito-autor"]/text()').extract_first()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment