Commit 19792a13 authored by Renán Sosa Guillen

crawlers

parent 1033155b
......@@ -22,7 +22,6 @@ def remove_tags(text):
return TAG_RE.sub('', text)
DAT_RE = re.compile(r'-\d{8}-')
RE = re.compile(r'\n\xa0')
class ImportantData(scrapy.Item):
......@@ -150,9 +149,11 @@ class QuotesSpider(scrapy.Spider):
if text == '':
t = remove_tags(response.xpath('//div[@class="news-body"]').extract_first())
res = RE.search(t)
if res:
text = t[:t.rfind(res.group(0))]
mod_tags = remove_tags(response.xpath('//section[@class="mod mod-tags"]').extract_first())
mod_list = remove_tags(response.xpath('//section[@class="mod mod-list"]').extract_first())
mod_comments = remove_tags(response.xpath('//section[@class="mod mod-comments"]').extract_first())
mod_outbrain = remove_tags(response.xpath('//div[@class="mod mod-outbrain"]').extract_first())
text = t.replace(mod_tags, '').replace(mod_list, '').replace(mod_comments, '').replace(mod_outbrain, '')
item['text'] = text.strip()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment