crawlers

7d116982 · Renán Sosa Guillen · 81ee8b42 · 7d116982 · 7d116982 · 7d116982
Commit 7d116982 authored Dec 01, 2017 by Renán Sosa Guillen
5 changed files
--- a/descarga_por_dia/puntoMedio/puntoMedio/__init__.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/__init__.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/settings.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/settings.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/__init__.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/__init__.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.py
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.py
 import scrapy, re

-
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-
+'''
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
+'''

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
-	return TAG_RE.sub('', text)
+  # Return `text` with every substring matched by TAG_RE (any '<...>' run) removed.
+  return TAG_RE.sub('', text)


 class NoticiasItem(scrapy.Item):
-	title = scrapy.Field()
-	text = scrapy.Field()
-	date = scrapy.Field()
-	location = scrapy.Field()
-	author = scrapy.Field()
-	topic = scrapy.Field()
-	url = scrapy.Field()
+  # Container for one scraped news article. The spider's parse_item fills
+  # title, text, date, topic and url; location and author are declared here
+  # but are not assigned by the spider in this file.
+  title = scrapy.Field()
+  text = scrapy.Field()
+  date = scrapy.Field()
+  location = scrapy.Field()
+  author = scrapy.Field()
+  topic = scrapy.Field()
+  url = scrapy.Field()


 class QuotesSpider(scrapy.Spider):
-	name = "noticias"
-	def start_requests(self):
-		year = getattr(self, 'year', None)
-		month = getattr(self, 'month', None)
-		day = getattr(self, 'day', None)
-		self.baseURL='https://www.puntomedio.mx/'+year+'/'+month+'/'+day
-		
-		yield scrapy.Request(url=self.baseURL, callback=self.parse)
-
-
-	def parse(self, response):
-		for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
-			yield scrapy.Request(url=link, callback=self.parse_item)
-
-		next_page = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
-		yield scrapy.Request(url=next_page, callback=self.parse)	
-
-
-	def parse_item(self, response):
-		item = NoticiasItem()
-		text = ''
-		item['title'] = response.css('h1.title::text').extract_first()
-		
-		d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-		## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
-		if d[-6:] != '-06:00':
-			d = d[:-6] + '-06:00'
-		item['date'] = d
-
-		item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract_first()
-		
-		for paragraph in response.css('div.post-entry').css('p').extract():
-			text += remove_tags(paragraph)
-		item['text'] = text
-		item['url'] = response.url
-		
-		# print item['title']
-		yield item
+  name = "noticias"
+
+  def start_requests(self):
+    """Yield the initial request for the archive page of the requested day.
+
+    The date is read from the spider attributes ``year``, ``month`` and
+    ``day`` (passed on the command line as ``-a year=2017 -a month=9 -a day=28``).
+    If any of them is missing, an error is logged and no request is issued,
+    instead of failing with a TypeError while concatenating None into the URL.
+    """
+    year = getattr(self, 'year', None)
+    month = getattr(self, 'month', None)
+    day = getattr(self, 'day', None)
+    if year is None or month is None or day is None:
+      self.logger.error('Missing date arguments; run with -a year=YYYY -a month=M -a day=D')
+      return
+
+    # str() keeps the URL construction working when the values are not strings
+    # (e.g. integers passed when the spider is created from code).
+    self.baseURL = 'http://www.puntomedio.mx/' + str(year) + '/' + str(month) + '/' + str(day)
+
+    yield scrapy.Request(url=self.baseURL, callback=self.parse)
+
+
+  def parse(self, response):
+    """Follow every article link on a listing page, then the next listing page.
+
+    Yields one request per article link (handled by parse_item) and, when the
+    page contains an "older posts" pagination link, one request for that page
+    (handled by parse again).
+    """
+    for link in response.css('div.col-md-8').css('h2.title').css('a::attr(href)').extract():
+      # urljoin leaves absolute URLs unchanged and resolves relative ones.
+      yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)
+
+    nextPage = response.css('div.pagination').css('a.older-posts::attr(href)').extract_first()
+    # extract_first() returns None when there is no older-posts link (last
+    # page); building a Request with a None URL would raise.
+    if nextPage is not None:
+      yield scrapy.Request(url=response.urljoin(nextPage), callback=self.parse)
+
+
+  def parse_item(self, response):
+    """Build and yield a NoticiasItem from a single article page.
+
+    Sets title, date, topic, text and url. The date is taken from the
+    ``article:published_time`` meta tag; when the tag is absent the date is
+    stored as None instead of raising while slicing it.
+    """
+    item = NoticiasItem()
+    item['title'] = response.css('h1.title::text').extract_first()
+
+    d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+    ## '-06:00' corresponds to UTC-6, the time zone of Yucatan (central Mexico).
+    ## The last six characters are treated as the UTC offset and are replaced
+    ## (the time itself is not converted).
+    if d is not None and d[-6:] != '-06:00':
+      d = d[:-6] + '-06:00'
+    item['date'] = d
+
+    item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
+
+    # Article body: the tag-stripped contents of every <p> under div.post-entry,
+    # concatenated in document order.
+    item['text'] = ''.join(remove_tags(p) for p in response.css('div.post-entry').css('p').extract())
+
+    item['url'] = response.url
+
+    yield item
+
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.pyc
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.pyc