"""Django management command that classifies stored news articles.

The command reads a JSON configuration file, loads a classifier through
``M3GunsNRoses``, runs it over the selected ``News`` rows page by page and
writes the per-document results to ``data.json``.
"""
import codecs
import datetime
import json
import operator
import os
import sys

import dill as pickle
import nltk
from django.core.management.base import BaseCommand, CommandError
from django.core.paginator import Paginator
from django.db.models import Q
from nltk.stem import SnowballStemmer
from textblob.classifiers import NaiveBayesClassifier

from catalog.models import User, News, Publisher, Topic, audioTime, Search

# M3GunsNRoses is imported from a checkout outside the project tree, so the
# path has to be registered before the import below.
# NOTE(review): hard-coded, machine-specific path - consider making it
# configurable.
sys.path.append("/home/mario/git/")
from Guns_N_Roses import M3GunsNRoses  # noqa: E402

# Primary keys of the publishers whose news are classified.
PUBLISHER_IDS = [
    3, 46, 39, 43, 45, 41, 40, 6, 7, 8, 27,
    9, 10, 12, 13, 14, 15, 17, 31, 21, 24,
]
# Only news dated on or after this day are considered.
EARLIEST_DATE = datetime.date(2014, 1, 1)
# Number of news rows fetched per page.
PAGE_SIZE = 50
# File the classification results are written to (relative to the CWD).
OUTPUT_PATH = 'data.json'
# Keys that must be present in the JSON configuration file.
REQUIRED_CFG_KEYS = ("model", "stopwords", "documents", "stop")
# One-pass text clean-up: non-breaking space -> plain space; the C1 quote
# characters \x93/\x94 and ASCII double/single quotes are removed.
_TEXT_CLEANUP = str.maketrans("\xa0", " ", "\x93\x94\"'")


class Command(BaseCommand):
    """Classify news with the configured model and dump the results as JSON.

    The positional ``cfg`` argument is the path to a JSON file providing:

    * ``model``     - passed to ``M3GunsNRoses.load_model``;
    * ``stopwords`` - passed to ``M3GunsNRoses.file2list``; the result is
      handed to the classifier;
    * ``documents`` - passed to ``M3GunsNRoses.file2list``; the result is used
      as the list of news ids to exclude;
    * ``stop``      - classification stops once at least this many documents
      have been labelled ``"guns"``.
    """

    help = 'Report database'

    def add_arguments(self, parser):
        """Register the positional ``cfg`` argument (path to the JSON config)."""
        # nargs=1 is kept so ``options['cfg']`` remains a one-element list.
        parser.add_argument(
            'cfg', nargs=1, type=str,
            help='Path to the JSON configuration file.',
        )

    def handle(self, *args, **options):
        """Run the classification and write the results to ``data.json``.

        Raises:
            CommandError: if the configuration file cannot be read, is not
                valid JSON, or lacks one of the required keys.
        """
        cfg = self._load_config(options['cfg'][0])
        stop = cfg["stop"]
        self.stdout.write(str(cfg['model']))

        # NOTE(review): load_model presumably unpickles the file (dill is
        # imported above) - only point it at trusted model files.
        clf = M3GunsNRoses.load_model(cfg["model"])
        stopwords = M3GunsNRoses.file2list(cfg["stopwords"])
        excluded_ids = M3GunsNRoses.file2list(cfg["documents"])

        publishers = Publisher.objects.filter(id__in=PUBLISHER_IDS)
        selection = (
            Q(publisher__in=publishers)
            & Q(date__gte=EARLIEST_DATE)
            & ~Q(id__in=excluded_ids)
        )
        news = News.objects.filter(selection).order_by('id')
        paginator = Paginator(news, PAGE_SIZE)

        self.stdout.write("Classifing...")
        categories = {"guns": 0, "roses": 0}
        doclist = []
        # page_range is 1-based and includes the last page; the previous
        # range(1, num_pages) silently skipped it (and did nothing at all
        # when the result set fit on a single page).
        for page_number in paginator.page_range:
            for doc in paginator.page(page_number):
                if not doc.text:
                    # Skip empty (or NULL) texts: nothing to classify.
                    continue
                item = self._classify(clf, stopwords, doc)
                self.stdout.write(str(item['probabilities']))
                doclist.append(item)
                label = item['category']
                # .get() tolerates labels other than the two pre-seeded ones.
                categories[label] = categories.get(label, 0) + 1
            # NOTE(review): the source's indentation was lost, so the exact
            # position of this check is an assumption; it is evaluated once
            # per page, after the page's documents have been processed.
            if categories["guns"] >= stop:
                break

        serialized = json.dumps(doclist, ensure_ascii=False, indent=2)
        self.stdout.write(serialized)
        with open(OUTPUT_PATH, 'w', encoding='utf8') as outfile:
            outfile.write(serialized)

    @staticmethod
    def _load_config(path):
        """Read the JSON configuration at *path* and check its required keys."""
        try:
            with open(path, encoding='utf-8') as handle:
                cfg = json.load(handle)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and decoding errors.
            raise CommandError(
                "Cannot read configuration %r: %s" % (path, err)
            ) from err
        if not isinstance(cfg, dict):
            raise CommandError(
                "Configuration %r must contain a JSON object" % (path,)
            )
        missing = [key for key in REQUIRED_CFG_KEYS if key not in cfg]
        if missing:
            raise CommandError(
                "Configuration %r lacks required key(s): %s"
                % (path, ", ".join(missing))
            )
        return cfg

    @staticmethod
    def _classify(clf, stopwords, doc):
        """Classify one news row and return its JSON-ready result record."""
        text = doc.text.translate(_TEXT_CLEANUP)
        dist_classes = M3GunsNRoses.NBanalyse(clf, text, stopwords)
        # The winning category is the label with the highest score.
        best_label = max(dist_classes.items(), key=operator.itemgetter(1))[0]
        return {
            'category': best_label,
            'probabilities': dist_classes,
            'text': text,
            'id': doc.id,
        }