Commit 605abb9c authored by Renán Sosa Guillen

crawlers

parent d3103297
......@@ -13,13 +13,10 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return ``text`` with every ``<...>`` span matched by ``TAG_RE`` deleted."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
# Dateline pattern: optional leading "-" or "- ", an optional capitalized word
# followed by a comma (presumably a weekday), a capitalized word (presumably
# the month), an optional 1-2 digit day, ", " and a 4-digit year, then an
# optional " -" or newline, an optional space, and one uppercase letter.
# NOTE: that final uppercase letter is part of the match.
DATE_RE1 = re.compile(r'(-|- )?([A-Z][a-z]+, ?)?[A-Z][a-z]+( \d{1,2})?, \d{4}( -|\n)? ?[A-Z]')
# Short date fragment: ", <letters> <1-2 digits> -", optionally followed by " -".
DATE_RE2 = re.compile(r', [a-zA-Z]+ \d{1,2} -( -)?')
# A single line beginning with "By" (optionally prefixed by "- "), including
# the newline before it and the newline that ends it.
AUTH_RE = re.compile(r'\n(- )?By.+\n')
class importantData(scrapy.Item):
......@@ -90,19 +87,30 @@ class QuotesSpider(scrapy.Spider):
for p in response.xpath('//div[@class="entry"]').css('p').extract():
text += remove_tags(p) + "\n"
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
# location = G_RE.sub('', m).strip()
# if len(location) <= 35:
# item['location'] = location
# text = text[text.find(m)+len(m):]
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text)
# text = TAG3_RE.sub('', text)
text = text.strip()
text = "\n" + text
text = text.replace(u'\u2013', "-")
text = text.replace(u'\u00a0', '') ## Elimina 'no-break spaces'
res = DATE_RE1.search(text)
if res:
m = res.group(0)[:-1]
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
res = DATE_RE2.search(text)
if res:
m = res.group(0)[:-1]
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
res = AUTH_RE.match(text)
if res:
m = res.group(0)
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
text = text.replace("Follow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", '')
item['text'] = text.strip()
item['url'] = response.url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment