date parser

6d213ea2 · Renán Sosa Guillen · b59503d9 · 6d213ea2
Commit 6d213ea2 authored Mar 12, 2018 by Renán Sosa Guillen
Hide whitespace changes
Inline Side-by-side

Showing with 55 additions and 12 deletions

parse_date_files.py parse_date_files.py +55 -12

No files found.
--- a/parse_date_files.py
+++ b/parse_date_files.py
 import json, os, sys
 from datetime import datetime
+from collections import OrderedDict
 """
 Uso:
 python parse_date_files.py <nombre_del_crawler>
 Ej.
-python parse_date_files.py laJornadaBC2
+python parse_date_files.py descarga_hacia_atras/laJornadaBC2
 """
+def dictRowGenerator(line):
+    """Return the known fields of a news record as ordered (key, value) pairs.
+
+    The pairs always come out in the fixed order date, topic, title, author,
+    location, text, url, so the caller can build an OrderedDict whose JSON
+    dump has a stable key order. Fields missing from ``line`` are skipped.
+
+    Args:
+        line: A mapping holding one news item (as loaded from the JSON file).
+
+    Returns:
+        A list of ``(field_name, value)`` tuples for the fields present.
+    """
+    # Export order of the columns; anything not listed here is dropped.
+    fields = ("date", "topic", "title", "author", "location", "text", "url")
+    row = []
+    for field in fields:
+        try:
+            row.append((field, line[field]))
+        except (KeyError, TypeError):
+            # Best effort: a missing key (or a record that cannot be indexed
+            # by name) simply contributes no column. Unlike a bare except,
+            # this no longer hides KeyboardInterrupt or unrelated errors.
+            continue
+    return row
+# The single command-line argument is split at its last "/": the part after
+# it is the media (crawler) name, the part before it is the download folder.
+# NOTE(review): if the argument contains no "/", rfind() returns -1, so
+# `media` becomes the whole argument and `download_type` becomes the argument
+# minus its last character.
+info = sys.argv[1]
+media = info[info.rfind("/") + 1:]
+download_type = info[:info.rfind("/")]
 this_file_path = os.path.dirname(os.path.realpath(__file__))
-json_file_path = this_file_path+'/descarga_hacia_atras/'+sys.argv[1]
+# Input folder: <script dir>/<download_type>/<media>
+json_file_path = this_file_path + "/" + download_type + "/" + media
-destination_path = this_file_path+'/'+sys.argv[1]
+# Output folder: <script dir>/<media>
+destination_path = this_file_path + "/" + media
-json_file = json.loads(open(json_file_path+'/noticias.json').read())
+# The whole noticias.json is read and parsed at once.
+# NOTE(review): the file object returned by open() is never explicitly closed.
+json_file = json.loads(open(json_file_path + "/noticias.json").read())
 date_set = set()
 for news in json_file:
    if news['date'] is not None:
-        news_date = news['date'][:news['date'].rfind('T')]
+        # Keep the text before the last "T"; if that is still longer than 10
+        # characters, cut at the last space instead.
+        news_date = news['date'][:news['date'].rfind("T")]
        if len(news_date) > 10:
            news_date = news['date'][:news['date'].rfind(' ')]
        if not news_date in date_set:
            date_set.add(news_date)
            print(news_date)
+            # URLs already written for this date, used to skip duplicates.
+            urlSet = set()
            try:
-                export_file = open(destination_path+'/'+news_date[:4]+'/'+news_date+'.json', 'a')
+                # Output file: <destination>/<first 4 chars of date>/<date>.json
+                export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
            except:
-                os.makedirs(destination_path+'/'+news_date[:4])
+                # NOTE(review): the bare except treats any failure of open()
+                # as "directory missing": it creates the directory and retries.
+                os.makedirs(destination_path + "/" + news_date[:4])
-                export_file = open(destination_path+'/'+news_date[:4]+'/'+news_date+'.json', 'a')
+                export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
            counter = 0
+            # NOTE(review): the file is opened in append mode, so if it already
+            # exists a second "[ ... ]" array is written after its old content.
            export_file.write("[")
+            # Every record is rescanned for each newly seen date.
            for line in json_file:
                if line['date'] is not None:
-                    line_date = line['date'][:line['date'].rfind('T')]
+                    line_date = line['date'][:line['date'].rfind("T")]
                    if len(line_date) > 10:
                        line_date = line['date'][:line['date'].rfind(' ')]
-                    if line_date == news_date:
+                    # Write a record only if it belongs to this date and its
+                    # URL has not been written yet.
+                    # NOTE(review): line['url'] is read unguarded here, unlike
+                    # in dictRowGenerator; presumably every record has a 'url'.
+                    if not line['url'] in urlSet and line_date == news_date:
+                        urlSet.add(line['url'])
                        counter += 1
+                        # Rebuild the record with a fixed key order for the dump.
+                        auxRow = dictRowGenerator(line)
+                        row = OrderedDict(auxRow)
                        if counter == 1:
-                            export_file.write(json.dumps(line))
+                            export_file.write(json.dumps(row))
                        elif counter > 1:
-                            export_file.write(",\n" + json.dumps(line))
+                            # Later records are preceded by ",\n" as separator.
+                            export_file.write(",\n" + json.dumps(row))
            export_file.write("]")
            export_file.close()