crawlers

7d116982 · Renán Sosa Guillen · 81ee8b42 · 7d116982 · 7d116982 · 7d116982
Commit 7d116982 authored Dec 01, 2017 by Renán Sosa Guillen
5 changed files
--- a/descarga_por_dia/puntoMedio/puntoMedio/__init__.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/__init__.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/settings.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/settings.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/__init__.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/__init__.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.py
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.py
 import scrapy, re

-
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-
+'''
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
+'''

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -21,11 +21,12 @@ class NoticiasItem(scrapy.Item):

 class QuotesSpider(scrapy.Spider):
  name = "noticias"
+
  def start_requests(self):
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
-		self.baseURL='https://www.puntomedio.mx/'+year+'/'+month+'/'+day
+    self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day

    yield scrapy.Request(url=self.baseURL, callback=self.parse)

@@ -34,8 +35,8 @@ class QuotesSpider(scrapy.Spider):
    for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
      yield scrapy.Request(url=link, callback=self.parse_item)

-		next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
-		yield scrapy.Request(url=next_page, callback=self.parse)	
+    nextPage = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
+    yield scrapy.Request(url=nextPage, callback=self.parse)


  def parse_item(self, response):
@@ -49,12 +50,14 @@ class QuotesSpider(scrapy.Spider):
      d = d[:-6] + '-06:00'
    item['date'] = d

-		item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract_first()
+    item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()

-		for paragraph in response.css('div.post-entry').css('p').extract():
-			text += remove_tags(paragraph)
+    for p in response.css('div.post-entry').css('p').extract():
+      text += remove_tags(p)
    item['text'] = text
+
    item['url'] = response.url

    # print item['title']
    yield item
+
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.pyc