Commit 20e53f8f authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 15cf2c16
......@@ -21,6 +21,8 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return *text* with every HTML tag (anything matching TAG_RE) stripped out."""
    stripped = TAG_RE.sub('', text)
    return stripped
# Pattern for a leading dateline such as "Tegucigalpa, HONDURAS." — two
# capitalized tokens separated by a comma, up to the first period.
# NOTE(review): matched later against the text before the first \u00a0 to
# extract item['location']; presumably articles start with such a dateline
# — confirm against the crawled pages.
LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
......@@ -45,9 +47,9 @@ class QuotesSpider(scrapy.Spider):
baseURL = "http://www.elheraldo.hn/"
# sectionList = ["tegucigalpa", "economia", "opinion", "revistas/crimenes",
# "pais", "sucesos", "deportes", "entretenimiento"]
sectionList = ["tegucigalpa"]
sectionList = ["tegucigalpa", "economia", "mundo", "revistas/crimenes",
"pais", "sucesos", "deportes", "entretenimiento"]
# sectionList = ["tegucigalpa"]
if self.stopDate is None:
for s in sectionList:
......@@ -68,6 +70,7 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
print response.url
searchData = response.meta['item']
CONTINUE_SEARCHING = True
linkSet = set()
......@@ -75,6 +78,8 @@ class QuotesSpider(scrapy.Spider):
searchData['section_url'] = response.url
linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//section[@id="principal"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
linkSet.remove(searchData['section_url'])
......@@ -85,6 +90,8 @@ class QuotesSpider(scrapy.Spider):
try:
linkSet.remove(searchData['section_url'])
except KeyError:
pass
if len(linkSet) <= 0:
CONTINUE_SEARCHING = False
for link in linkSet:
......@@ -106,20 +113,26 @@ class QuotesSpider(scrapy.Spider):
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
searchData['section_url'] = response.url
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
linkList = list(linkSet)
linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
linkList.remove(searchData['section_url'])
else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
try:
linkSet.remove(searchData['section_url'])
linkList = list(linkSet)
linkList.remove(searchData['section_url'])
except KeyError:
linkList = []
pass
newsList = []
for link in linkList:
if not link in newsList:
newsList.append(link)
for link in linkList:
info = ImportantData()
......@@ -144,10 +157,11 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
d = response.xpath('//time/text()').extract_first()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
......@@ -156,7 +170,15 @@ class QuotesSpider(scrapy.Spider):
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0') + 1:]
item['text'] = text
item['url'] = response.url
......@@ -164,18 +186,16 @@ class QuotesSpider(scrapy.Spider):
def parse_item_with_stop_date(self, response):
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
d = newsData['datePublished']
d = d[:d.find("T")]
dt = datetime.strptime(d, '%Y-%m-%d').date()
d = response.xpath('//time/text()').extract_first()
dt = datetime.strptime(d, '%d.%m.%Y').date()
if dt >= self.stopDate:
info = response.meta['item']
item = NoticiasItem()
text = ''
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
......@@ -184,7 +204,15 @@ class QuotesSpider(scrapy.Spider):
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0')+1:]
item['text'] = text
item['url'] = response.url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment