Commit 6d213ea2 authored by Renán Sosa Guillen

date parser

parent b59503d9
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Usage:
python parse_date_files.py <crawler_name>
E.g.
python parse_date_files.py laJornadaBC2
python parse_date_files.py descarga_hacia_atras/laJornadaBC2
"""
def dictRowGenerator(line):
    """Build the ordered (key, value) pairs exported for one news record.

    Args:
        line: Mapping holding one news item (as loaded from the JSON input).

    Returns:
        A list of ``(key, value)`` tuples for whichever of ``date``,
        ``topic``, ``title``, ``author``, ``location``, ``text`` and ``url``
        can be read from ``line``, always in that fixed order. Fields that
        cannot be read are skipped, so the list may be shorter or empty.
    """
    exported_keys = ("date", "topic", "title", "author", "location", "text", "url")
    row = []
    for key in exported_keys:
        try:
            row.append((key, line[key]))
        except (KeyError, TypeError):
            # Best effort: a missing field (or a record that is not a
            # mapping at all) is skipped instead of aborting the export.
            # Only lookup failures are caught so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            continue
    return row
# Split <download_type>/<media>/noticias.json into one JSON file per day,
# written to <media>/<first four characters of the day>/<day>.json next to
# this script. Within a day, only the first record seen for each URL is kept.

# The single command-line argument is "<media>" or "<download_type>/<media>".
info = sys.argv[1]
# rpartition() yields an empty prefix when there is no "/"; slicing with
# rfind() == -1 would instead chop the last character off the crawler name.
download_type, _, media = info.rpartition("/")

this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = os.path.join(this_file_path, download_type, media)
destination_path = os.path.join(this_file_path, media)

with open(os.path.join(json_file_path, "noticias.json")) as source_file:
    json_file = json.load(source_file)


def _day_of(stamp):
    """Return the day part of a record's date string.

    The string is cut at its last "T"; if the result is still longer than
    10 characters it is cut at the last space instead.

    NOTE(review): when the separator is absent, rfind() returns -1 and the
    slice drops the final character (e.g. a bare "2017-05-01" becomes
    "2017-05-0"). This is kept as-is; confirm the input always carries a
    time part before tightening it.
    """
    day = stamp[:stamp.rfind("T")]
    if len(day) > 10:
        day = stamp[:stamp.rfind(" ")]
    return day


# One pass over the input: day -> (url -> filtered row). Both levels keep
# first-seen order, so days and rows come out in input order.
date_set = OrderedDict()
for news in json_file:
    if news['date'] is None:
        continue
    rows_for_day = date_set.setdefault(_day_of(news['date']), OrderedDict())
    if news['url'] not in rows_for_day:
        rows_for_day[news['url']] = OrderedDict(dictRowGenerator(news))

for news_date, rows_for_day in date_set.items():
    print(news_date)

    year_dir = os.path.join(destination_path, news_date[:4])
    if not os.path.isdir(year_dir):
        os.makedirs(year_dir)

    # NOTE: the file is opened in append mode, so running the script again
    # for the same day adds a second "[...]" array to the existing file.
    with open(os.path.join(year_dir, news_date + ".json"), 'a') as export_file:
        export_file.write("[")
        export_file.write(",\n".join(json.dumps(row) for row in rows_for_day.values()))
        export_file.write("]")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment