Commit 57f362cd authored by Mario Chirinos Colunga's avatar Mario Chirinos Colunga 💬

el sur

parent eac90030
......@@ -8,7 +8,7 @@
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Scrapy project settings fragment for the elSur crawler.
FEED_EXPORT_ENCODING = 'utf-8'  # write exported feeds (e.g. the -s filename JSON) as UTF-8
BOT_NAME = 'elSur'  # project/bot identifier used by Scrapy
SPIDER_MODULES = ['elSur.spiders']  # package(s) Scrapy scans for spider classes
......
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
#TAG_RE = re.compile(r'<[^>]+>')
#HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
def remove_tags(text):
    """Strip every HTML/XML tag from *text* and return the remaining plain text.

    FIX: the module-level ``TAG_RE`` constant is commented out in this
    revision (see the commented line above), so the original body raised
    NameError at call time.  The same pattern is inlined here instead.
    """
    return re.sub(r'<[^>]+>', '', text)
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class.

    Crawls the El Sur (Guerrero) date-archive page for a given date and
    yields one NoticiasItem per article link found on it.
    """
    name = "noticias"

    def start_requests(self):
        """Build the archive URL from the -a year/month/day arguments and request it.

        month and day are zero-padded with zfill; calling zfill on a
        missing argument (None) raises AttributeError, so all three
        arguments are effectively required.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link on the archive page.

        FIX: the original line was a SyntaxError -- the closing
        parenthesis of ``set(...)`` was missing.  ``set`` is kept so
        duplicate links are requested only once.
        Pagination handling is disabled in this revision, so only the
        first archive page is crawled.
        """
        print(response)
        for link in set(response.css('#post-list').css("li > div > div > a::attr(href)").extract()):
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Yield a NoticiasItem for an article page.

        All field-extraction logic is disabled (commented out) in this
        revision, so the yielded item carries no data yet.
        """
        print(response)
        item = NoticiasItem()
        yield item
# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
# Pattern matching any single HTML/XML tag.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag removed."""
    cleaned = re.sub(TAG_RE, '', text)
    return cleaned


# Byline headers ("Texto: ... Foto: ..." / "Texto y foto: ...") optionally
# followed by a Spanish dateline such as "5 de septiembre de 2018."
HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#from dateutil.parser import parse
from dateparser import parse
from datetime import datetime
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class.

    Crawls the El Sur (Guerrero) date archive at
    https://suracapulco.mx/<year>/<month>/<day>/ and yields one
    NoticiasItem per article that has a non-empty body text.
    """
    name = "noticias"

    def start_requests(self):
        """Build the archive URL from the -a year/month/day arguments and request it.

        month and day are zero-padded with zfill; calling zfill on a
        missing argument (None) raises AttributeError, so all three
        arguments are effectively required.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link on the archive page, then the first
        pagination link, when one exists."""
        print(response)
        for link in response.css('#post-list').css("li > div > div > a::attr(href)").extract():
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)
        # Follow pagination: the first link inside .paging-navigation, if any.
        next_pages = response.css(".paging-navigation > a::attr(href)").extract()
        if next_pages:
            next_page = next_pages[0]
            print(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        """Extract date, title, topic and body text from an article page
        and yield a populated NoticiasItem.

        Items with an empty body text are dropped.  FIX: the original
        else branch ended with a bare ``yield``, pushing a useless None
        into the Scrapy output pipeline; it has been removed.
        """
        print(response.encoding)
        date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        title = response.xpath('//meta[@property="og:title"]/@content').extract_first().lower()
        topic = response.xpath('//meta[@property="article:section"]/@content').extract_first().lower()
        # Body paragraphs, one per line with internal newlines stripped.
        text = ""
        for p in response.css("div.xt-post-content > p::text").extract():
            text += p.replace("\n", "") + "\n"
        item = NoticiasItem()
        # NOTE(review): int(date) assumes the published_time meta holds a
        # Unix timestamp; if the site emits an ISO-8601 string this raises
        # ValueError -- TODO confirm against a live page.
        item['date'] = datetime.fromtimestamp(int(date)).isoformat()
        item['title'] = title
        item['topic'] = topic
        item['text'] = text
        item['url'] = response.url
        if item["text"] != "":
            print(item)
            yield item
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.