Commit 75add0f4 authored by Renán Sosa Guillen

merge foraneos

parents 2c6cb6c3 19792a13
...@@ -22,7 +22,6 @@ def remove_tags(text): ...@@ -22,7 +22,6 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
DAT_RE = re.compile(r'-\d{8}-') DAT_RE = re.compile(r'-\d{8}-')
RE = re.compile(r'\n\xa0')
class ImportantData(scrapy.Item): class ImportantData(scrapy.Item):
...@@ -150,9 +149,11 @@ class QuotesSpider(scrapy.Spider): ...@@ -150,9 +149,11 @@ class QuotesSpider(scrapy.Spider):
if text == '': if text == '':
t = remove_tags(response.xpath('//div[@class="news-body"]').extract_first()) t = remove_tags(response.xpath('//div[@class="news-body"]').extract_first())
res = RE.search(t) mod_tags = remove_tags(response.xpath('//section[@class="mod mod-tags"]').extract_first())
if res: mod_list = remove_tags(response.xpath('//section[@class="mod mod-list"]').extract_first())
text = t[:t.rfind(res.group(0))] mod_comments = remove_tags(response.xpath('//section[@class="mod mod-comments"]').extract_first())
mod_outbrain = remove_tags(response.xpath('//div[@class="mod mod-outbrain"]').extract_first())
text = t.replace(mod_tags, '').replace(mod_list, '').replace(mod_comments, '').replace(mod_outbrain, '')
item['text'] = text.strip() item['text'] = text.strip()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment