# gunsNroses2.py
# Committed by Mario Chirinos Colunga
from django.core.management.base import BaseCommand, CommandError
from catalog.models import User, News, Publisher, Topic, audioTime, Search
from django.db.models import Q
import operator
from django.core.paginator import Paginator

import os
#import simplejson as json
import json
from django.db.models import Q
import sys
sys.path.append("/home/mario/git/")
from Guns_N_Roses import M3GunsNRoses
import datetime



#from os import path
import codecs
import nltk
from nltk.stem import SnowballStemmer
from textblob.classifiers import (NaiveBayesClassifier)
import dill as pickle
class Command(BaseCommand):
    """Classify stored news items as "guns" or "roses" and dump the result as JSON.

    The command takes the path of a JSON configuration file as its single
    positional argument. The keys read from that file are:

    * ``model``     -- passed to ``M3GunsNRoses.load_model``.
    * ``stopwords`` -- passed to ``M3GunsNRoses.file2list``; the result is
      handed to the classifier on every call.
    * ``documents`` -- passed to ``M3GunsNRoses.file2list``; the resulting
      values are used as news ids to exclude from the query.
    * ``stop``      -- classification stops once at least this many items
      have been labelled ``"guns"`` (checked after each page).

    The classified items are printed and written to ``data.json`` in the
    current working directory.
    """

    help = 'Report database'

    # Ids of the publishers whose news are considered.
    _PUBLISHER_IDS = [3, 46, 39, 43, 45, 41, 40, 6, 7, 8, 27, 9, 10, 12, 13,
                      14, 15, 17, 31, 21, 24]
    # Only news dated on or after this day are considered.
    _START_DATE = datetime.date(2014, 1, 1)
    # Number of news rows fetched per page.
    _PAGE_SIZE = 50
    # Single-pass character clean-up applied to every text before
    # classification: NBSP becomes a plain space; the \x93 / \x94 characters
    # and both kinds of straight quotes are dropped.
    _TEXT_CLEANUP = str.maketrans({
        "\xa0": " ",
        "\x93": None,
        "\x94": None,
        '"': None,
        "'": None,
    })

    def add_arguments(self, parser):
        """Register the positional ``cfg`` argument (path to the JSON config)."""
        parser.add_argument('cfg', nargs=1, type=str)

    def handle(self, *args, **options):
        """Run the classification and write ``data.json``.

        Raises whatever ``open``/``json.load`` raise for an unreadable or
        malformed configuration file, and ``KeyError`` when a required
        configuration key is missing.
        """
        with open(options['cfg'][0], encoding='utf-8') as f:
            cfg = json.load(f)
        print(cfg['model'])

        clf = M3GunsNRoses.load_model(cfg["model"])
        stopwords = M3GunsNRoses.file2list(cfg["stopwords"])
        documents_out = M3GunsNRoses.file2list(cfg["documents"])

        publishers = Publisher.objects.filter(id__in=self._PUBLISHER_IDS)
        query = (
            Q(publisher__in=publishers)
            & Q(date__gte=self._START_DATE)
            & ~Q(id__in=documents_out)
        )
        # A stable ordering is required for pagination to be consistent.
        news_list = News.objects.filter(query).order_by('id')
        paginator = Paginator(news_list, self._PAGE_SIZE)

        print("Classifing...")
        doclist = []
        categories = {"guns": 0, "roses": 0}

        # page_range covers 1..num_pages inclusive; range(1, num_pages) would
        # silently skip the last page (and everything when there is one page).
        for page_number in paginator.page_range:
            for doc in paginator.page(page_number):
                # Skip rows without text (empty string or None).
                if not doc.text:
                    continue
                text = doc.text.translate(self._TEXT_CLEANUP)
                dist_classes = M3GunsNRoses.NBanalyse(clf, text, stopwords)
                # The label with the highest score wins.
                category = max(dist_classes.items(), key=operator.itemgetter(1))[0]
                doclist.append({
                    'category': category,
                    'probabilities': dist_classes,
                    'text': text,
                    'id': doc.id,
                })
                # .get() tolerates a label outside the two pre-seeded ones
                # instead of aborting the whole run with a KeyError.
                categories[category] = categories.get(category, 0) + 1
                print(dist_classes)

            # Checked once per page, so the final "guns" count may exceed
            # cfg["stop"] by up to one page worth of items.
            if categories["guns"] >= cfg["stop"]:
                break

        with open('data.json', 'w', encoding='utf8') as outfile:
            s = json.dumps(doclist, ensure_ascii=False, indent=2)
            print(s)
            outfile.write(s)