Commit fed7af4d authored by Mario Chirinos Colunga's avatar Mario Chirinos Colunga 💬

new db

parent 13ce817c
{
"model":"/home/mario/git/Guns_N_Roses/utils/BayesClassifier_pos_1000_neg_1250_acc_0.92.pkl",
"documents":"/home/mario/git/Guns_N_Roses/utils/ids_vistos_primera_anot",
"stopwords":"/home/mario/git/Guns_N_Roses/utils/stopwords",
"stop": 500
}
......@@ -28,6 +28,9 @@ class Command(BaseCommand):
categories = dict()
doclist = list()
print(newsList.count())
print(p.num_pages)
categories ={"roses":0, "guns":0}
for i in range (1,p.num_pages):
# print(i)
docs_new = [d for d in p.page(i) if d.text!=""]
......@@ -41,13 +44,13 @@ class Command(BaseCommand):
item['text'] = document
item['id'] = docs_id[ii]
doclist.append(item)
if category not in categories:
categories[category] = 0;
# if category not in categories:
# categories[category] = 0;
categories[category] +=1
ii+=1
finish = True
for c in categories:
finish = finish and categories[c] >=1000
finish = finish and categories[c] >= 5000
if finish:
break
......@@ -59,7 +62,7 @@ class Command(BaseCommand):
print(categories)
with open('data.json', 'w', encoding='utf8') as outfile:
s = json.dumps(doclist, ensure_ascii=False, indent=2)
print (s)
# print (s)
outfile.write(s)
......
from django.core.management.base import BaseCommand, CommandError
from catalog.models import User, News, Publisher, Topic, audioTime, Search
from django.db.models import Q
import operator
from django.core.paginator import Paginator
import os
#import simplejson as json
import json
from django.db.models import Q
import sys
sys.path.append("/home/mario/git/")
from Guns_N_Roses import M3GunsNRoses
import datetime
#from os import path
import codecs
import nltk
from nltk.stem import SnowballStemmer
from textblob.classifiers import (NaiveBayesClassifier)
import dill as pickle
class Command(BaseCommand):
    """Classify stored news items and dump the results to ``data.json``.

    The command takes the path of a JSON configuration file that provides
    the keys ``model``, ``stopwords``, ``documents`` and ``stop``.
    """

    help = 'Report database'

    def add_arguments(self, parser):
        """Register the single positional ``cfg`` argument (config file path)."""
        parser.add_argument('cfg', nargs=1, type=str)

    def handle(self, *args, **options):
        """Classify news page by page until enough "guns" items were found.

        Every non-empty news text is cleaned, classified with
        ``M3GunsNRoses.NBanalyse`` and appended to the output list together
        with its most probable category and the full class distribution.
        The collected list is printed and written to ``data.json``.
        """
        with open(options['cfg'][0]) as f:
            cfg = json.load(f)
        print(cfg['model'])

        clf = M3GunsNRoses.load_model(cfg["model"])
        stopwords = M3GunsNRoses.file2list(cfg["stopwords"])
        # Ids listed in the "documents" file are excluded from the query.
        documentsOut = M3GunsNRoses.file2list(cfg["documents"])

        pub = [3, 46, 39, 43, 45, 41, 40, 6, 7, 8, 27, 9, 10, 12, 13, 14, 15, 17, 31, 21, 24]
        publishers = Publisher.objects.all().filter(id__in=pub)
        myQuery = (
            Q(publisher__in=publishers)
            & Q(date__gte=datetime.date(2014, 1, 1))
            & ~Q(id__in=documentsOut)
        )
        newsList = News.objects.all().filter(myQuery).order_by('id')
        p = Paginator(newsList, 50)

        print("Classifing...")
        doclist = list()
        categories = {"guns": 0, "roses": 0}

        # page_range covers 1..num_pages inclusive; the previous
        # range(1, p.num_pages) silently skipped the last page.
        for i in p.page_range:
            docs_new = [d for d in p.page(i) if d.text != ""]
            for doc in docs_new:
                text = doc.text.replace("\xa0", " ").replace("\x93", "").replace("\x94", "").replace('"', '').replace("'", "")
                dist_classes = M3GunsNRoses.NBanalyse(clf, text, stopwords)

                item = dict()
                item['category'] = max(dist_classes.items(), key=operator.itemgetter(1))[0]
                item['probabilities'] = dist_classes
                item['text'] = text
                item['id'] = doc.id
                print(dist_classes)
                doclist.append(item)

                category = item['category']
                # Tolerate labels other than the two pre-seeded ones
                # instead of raising KeyError.
                categories[category] = categories.get(category, 0) + 1
                print(dist_classes)

            # Checked once per page, so the final "guns" count may exceed
            # cfg["stop"] by up to one page of documents.
            if categories["guns"] >= cfg["stop"]:
                break

        with open('data.json', 'w', encoding='utf8') as outfile:
            s = json.dumps(doclist, ensure_ascii=False, indent=2)
            print (s)
            outfile.write(s)
from django.core.management.base import BaseCommand, CommandError
from catalog.models import User, News, Publisher, Topic, audioTime, Search
from django.db.models import Q
import operator
from django.core.paginator import Paginator
import os
#import simplejson as json
import json
from django.db.models import Q
import sys
sys.path.append("/home/mario/git/")
from Guns_N_Roses import M3GunsNRoses
import datetime
#from os import path
import codecs
import nltk
from nltk.stem import SnowballStemmer
from textblob.classifiers import (NaiveBayesClassifier)
import dill as pickle
#-------------------------------------------------------------------------------------------------
def file2list(fname):
    """Return the non-empty lines of the text file *fname* as a list.

    Trailing newline characters are removed from every line. Lines that
    are empty after that are dropped. The previous ``not l is ''`` test
    compared identity against a literal before stripping, so it never
    filtered anything and blank lines ended up in the result as ``''``.
    """
    with open(fname) as f:
        stripped = (line.strip('\n') for line in f)
        return [line for line in stripped if line != '']
#-------------------------------------------------------------------------------------------------
class Command(BaseCommand):
    """Export the news items matching any word of a word list to ``dataNews.json``."""

    help = 'Report database'

    def add_arguments(self, parser):
        """Register the single positional ``words`` argument (word-list file path)."""
        parser.add_argument('words', nargs=1, type=str)

    def handle(self, *args, **options):
        """Collect the ids of all news matching a word, then dump those news.

        One query per word is run against ``search_vector``, restricted to
        the hard-coded publishers and to dates from 2014-01-01 on. The
        union of the matching ids is fetched again and written as a JSON
        list of ``text``/``id``/``date``/``url`` records.
        """
        wordList = file2list(options['words'][0])
        # Previously print(list), which printed the builtin type rather
        # than the words that were loaded.
        print(wordList)

        pub = [3, 46, 39, 43, 45, 41, 40, 6, 7, 8, 27, 9, 10, 12, 13, 14, 15, 17, 31, 21, 24]
        publishers = Publisher.objects.all().filter(id__in=pub)
        # The publisher/date restriction is the same for every word.
        baseQuery = Q(publisher__in=publishers) & Q(date__gte=datetime.date(2014, 1, 1))

        ids = set()
        for w in wordList:
            matches = News.objects.all().filter(baseQuery & Q(search_vector=w))
            # Fetch only the id column instead of building full model
            # instances just to read their ids.
            ids.update(matches.values_list('id', flat=True))

        newsList = News.objects.all().filter(Q(id__in=ids))
        print("end")
        print(newsList.count())

        doclist = [
            {"text": n.text, "id": n.id, "date": n.date.strftime("%Y-%m-%d"), "url": n.url}
            for n in newsList
        ]
        with open('dataNews.json', 'w', encoding='utf8') as outfile:
            s = json.dumps(doclist, ensure_ascii=False, indent=2)
            outfile.write(s)
......@@ -8,10 +8,18 @@ class Command(BaseCommand):
help = 'Report database'
# def add_arguments(self, parser):
# parser.add_argument('basedir', nargs=1, type=str) #(options['basedir'][0]
def add_arguments(self, parser):
parser.add_argument('json', nargs=1, type=str) #(options['basedir'][0]
def handle(self, *args, **options):
data = dict()
data['users'] = User.objects.all().count()
data['publishers'] = Publisher.objects.all().count()
data['text'] = Publisher.objects.all().filter(type="texto").count()
data['audio'] = Publisher.objects.all().filter(type="audio").count()
data['documents'] = News.objects.all().count()
data['searches'] = Search.objects.all().count()
print ( "Users: " + str(User.objects.all().count()) )
print ( "Publishers: " + str(Publisher.objects.all().count()) )
print ( "\tText: " + str(Publisher.objects.all().filter(type="texto").count()) )
......@@ -36,3 +44,6 @@ class Command(BaseCommand):
audioSources = Publisher.objects.all().filter(type="audio")
for a in audioSources:
print(a.name + ", " + a.url + ", " + str(audioTime.objects.all().filter(publisher=a)[0].minutes ))
print(data)
with open(options['json'][0], 'w') as outfile:
json.dump(data, outfile)
......@@ -18,7 +18,7 @@ class Command(BaseCommand):
def handle(self, *args, **options):
if True:
if False:
#update radio stations recording time
print("Recording Time:")
recordingsDir = "/home/mario/virtualHDD/m3/recordings/"
......@@ -50,14 +50,14 @@ class Command(BaseCommand):
if news.count()>0:
minYear = news[0].date.year
lastDate = news[0].date
yearList = [ int(y) for y in os.listdir('.')]
yearList = sorted([ int(y) for y in os.listdir('.')])
print (yearList)
for y in yearList:
for y in sorted(yearList):
if y >=minYear:
os.chdir(str(y))
print (os.getcwd())
filesList = os.listdir(".")
filesList = sorted(os.listdir("."))
for f in filesList:
fileDate = datetime.datetime.strptime(f[:f.find(".")], "%Y-%m-%d").date()
......
......@@ -145,7 +145,7 @@ def settingsView(request):
else:
form = ProfileForm( initial={'subscriptions':[ v for v in request.user.profile.subscriptions.all().values_list('id', flat=True)]})
news = News.objects.all()
publishersList = Publisher.objects.all().filter( Q(id__in = news.values('publisher').distinct()))
publishersList = Publisher.objects.filter( Q(id__in = news.values('publisher').distinct()))
choice = [ (r.id,r.name) for r in publishersList ]
form.fields['subscriptions'].choices=choice
......@@ -184,7 +184,7 @@ def getNewsByRequest(request):
print (myQuery)
return News.objects.all().filter(myQuery)
return News.objects.filter(myQuery)
#-------------------------------------------------------------------------------
def index(request):
......@@ -217,7 +217,7 @@ def index(request):
info['nPublishers'] = news.values('publisher').distinct().count()
info['nNews'] = news.count()
info['nNewsText'] = Quantity(news.count()).render(prec=3)
info['nAudio'] = Publisher.objects.all().filter(type="audio").count()
info['nAudio'] = Publisher.objects.filter(type="audio").count()
queryset = news.values("publisher").order_by("publisher").annotate(count = Count('publisher') )
......@@ -230,7 +230,7 @@ def index(request):
donutChart = [{"label": Publisher.objects.get(id=q['publisher']).name, "value":q["count"], "url":urlDict[q['publisher']]} for q in queryset]
publishers = Publisher.objects.all().filter( Q(id__in = news.values('publisher').distinct()))
publishers = Publisher.objects.filter( Q(id__in = news.values('publisher').distinct()))
queryset = news.values("publisher").annotate(day=TruncMonth('date') )
......@@ -254,7 +254,7 @@ def publisherList(request, type="all"):
news = getNewsByRequest(request)
publishers = Publisher.objects.all().filter( Q(id__in = news.values('publisher').distinct()) & typeQuery)
publishers = Publisher.objects.filter( Q(id__in = news.values('publisher').distinct()) & typeQuery)
for p in publishers:
......@@ -295,7 +295,7 @@ def wsAudioList(request):
#-------------------------------------------------------------------------------
def wsSearchList(request):
searches = Search.objects.all().filter(user=request.user)
searches = Search.objects.filter(user=request.user)
data = dict()
data['data']=[[s.text, "*" if s is None else s.startDate.strftime('%Y-%m-%d'), "*" if s is None else s.endDate.strftime('%Y-%m-%d'), ','.join([ sub.shortName for sub in s.publishers.all()])] for s in searches]
......@@ -405,7 +405,7 @@ def wsDownloadNews(request):
def audioList(request):
form = SearchForm(request.GET)
publishers = audioTime.objects.all().filter(minutes__gt=0 ) #type="audio")
publishers = audioTime.objects.filter(minutes__gt=0 ) #type="audio")
return render(request,'audioList.html',{"form":form, "publishers":publishers})
#-------------------------------------------------------------------------------
......@@ -413,7 +413,7 @@ def audioPublisher(request, publisher):
form = SearchForm(request.GET)
info=dict()
if Publisher.objects.all().filter(shortName=publisher).count() > 0:
p = Publisher.objects.all().filter(shortName=publisher)[0]
if Publisher.objects.filter(shortName=publisher).count() > 0:
p = Publisher.objects.filter(shortName=publisher)[0]
return render(request,'audioPlay.html',{"form":form, "publisher":p})
......@@ -26,4 +26,5 @@ urlpatterns = [
url(r'^catalog/', include('catalog.urls')),
url(r'^$', RedirectView.as_view(url='/catalog/', permanent=True)),
url(r'^accounts/', include('django.contrib.auth.urls')),
# url(r'^ws/reports/images/(?P<type>\w+)$', views.wsReportImages, name='wsReportImages')
] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment