Commit 6d213ea2 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

date parser

parent b59503d9
import json, os, sys import json, os, sys
from datetime import datetime from datetime import datetime
from collections import OrderedDict
""" """
Uso: Uso:
python parse_date_files.py <nombre_del_crawler> python parse_date_files.py <nombre_del_crawler>
Ej. Ej.
python parse_date_files.py laJornadaBC2 python parse_date_files.py descarga_hacia_atras/laJornadaBC2
""" """
def dictRowGenerator(line):
    """Build an ordered list of (field, value) pairs from one scraped news item.

    Parameters:
        line -- dict-like news record; any of the known fields may be absent.

    Returns:
        list of (field_name, value) tuples, in the canonical field order,
        containing only the fields actually present in *line*. Suitable for
        feeding straight into OrderedDict().
    """
    # Canonical output order for the exported JSON rows.
    fields = ("date", "topic", "title", "author", "location", "text", "url")
    # The original wrapped each lookup in a bare try/except/pass; an explicit
    # membership test keeps the same "skip missing keys" behavior without
    # silently swallowing unrelated exceptions.
    return [(key, line[key]) for key in fields if key in line]
# Split "download_type/crawler_name" (e.g. "descarga_hacia_atras/laJornadaBC2").
info = sys.argv[1]
media = info[info.rfind("/") + 1:]
download_type = info[:info.rfind("/")]

this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = this_file_path + "/" + download_type + "/" + media
destination_path = this_file_path + "/" + media


def _news_day(raw_date):
    """Return the 'YYYY-MM-DD' prefix of a timestamp such as
    '2017-01-02T10:00:00' or '2017-01-02 10:00:00'.

    The original sliced with rfind("T") unconditionally; when no "T" was
    present rfind returned -1 and silently chopped the last character.
    """
    day = raw_date[:raw_date.rfind("T")] if "T" in raw_date else raw_date
    if len(day) > 10:
        # Space-separated timestamp instead of ISO "T" separator.
        day = raw_date[:raw_date.rfind(" ")]
    return day


# Close the input file deterministically (the original leaked the handle).
with open(json_file_path + "/noticias.json") as source:
    json_file = json.loads(source.read())

# Group articles by day in ONE pass, deduplicating by URL within each day.
# The original rescanned the entire list for every distinct date (O(n^2)).
# OrderedDict keeps the first-seen date order, matching the original output
# order of files/prints.
articles_by_day = OrderedDict()
for line in json_file:
    if line['date'] is None:
        continue
    day = _news_day(line['date'])
    rows, seen_urls = articles_by_day.setdefault(day, ([], set()))
    if line['url'] not in seen_urls:
        seen_urls.add(line['url'])
        rows.append(OrderedDict(dictRowGenerator(line)))

for news_date, (rows, _seen) in articles_by_day.items():
    print(news_date)
    year_dir = destination_path + "/" + news_date[:4]
    # Create the per-year directory explicitly instead of relying on a bare
    # except around open(), which also hid unrelated I/O errors.
    if not os.path.exists(year_dir):
        os.makedirs(year_dir)
    # 'w' rather than the original 'a': appending "[...]" to an existing
    # file produced concatenated, invalid JSON on every rerun.
    with open(year_dir + "/" + news_date + ".json", 'w') as export_file:
        export_file.write("[")
        export_file.write(",\n".join(json.dumps(row) for row in rows))
        export_file.write("]")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment