Commit 605abb9c authored by Renán Sosa Guillen

crawlers

parent d3103297
...@@ -13,13 +13,10 @@ TAG_RE = re.compile(r'<[^>]+>') ...@@ -13,13 +13,10 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?') DATE_RE1 = re.compile(r'(-|- )?([A-Z][a-z]+, ?)?[A-Z][a-z]+( \d{1,2})?, \d{4}( -|\n)? ?[A-Z]')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?') DATE_RE2 = re.compile(r', [a-zA-Z]+ \d{1,2} -( -)?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I) AUTH_RE = re.compile(r'\n(- )?By.+\n')
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
class importantData(scrapy.Item): class importantData(scrapy.Item):
...@@ -90,19 +87,30 @@ class QuotesSpider(scrapy.Spider): ...@@ -90,19 +87,30 @@ class QuotesSpider(scrapy.Spider):
for p in response.xpath('//div[@class="entry"]').css('p').extract(): for p in response.xpath('//div[@class="entry"]').css('p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n"
# result = LOC_RE.search(text) text = text.strip()
# if result: text = "\n" + text
# m = result.group(0) text = text.replace(u'\u2013', "-")
# location = G_RE.sub('', m).strip() text = text.replace(u'\u00a0', '') ## Elimina 'no-break spaces'
# if len(location) <= 35:
# item['location'] = location res = DATE_RE1.search(text)
# text = text[text.find(m)+len(m):] if res:
m = res.group(0)[:-1]
# text = EM_RE.sub('', text) text = text[text.find(m) + len(m):].strip()
# text = TW_RE.sub('', text) text = "\n" + text
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text) res = DATE_RE2.search(text)
# text = TAG3_RE.sub('', text) if res:
m = res.group(0)[:-1]
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
res = AUTH_RE.match(text)
if res:
m = res.group(0)
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
text = text.replace("Follow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", '')
item['text'] = text.strip() item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment