Skip to content

Commit

Permalink
Merge pull request #214 from bab2min/dev_viewer
Browse files Browse the repository at this point in the history
Simple Topic Model Viewer
  • Loading branch information
bab2min authored Dec 18, 2023
2 parents 425197e + 64b90b7 commit a1a2a8b
Show file tree
Hide file tree
Showing 9 changed files with 860 additions and 40 deletions.
47 changes: 47 additions & 0 deletions src/python/py_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,53 @@ void char2Byte(const tomoto::SharedString& str, vector<uint32_t>& startPos, vect
return char2Byte(str.begin(), str.end(), startPos, length);
}

// Converts byte-based (offset, length) pairs into code-point-based pairs,
// in place, for the UTF-8 text in [strBegin, strEnd).
//
// On entry, startPos[i]/length[i] are byte offsets/lengths into the buffer;
// on return they are code-point offsets/lengths. An offset that falls inside
// a multi-byte sequence is snapped forward to the next code-point boundary
// (via lower_bound on the boundary table).
//
// Throws std::runtime_error on malformed UTF-8: an unexpected continuation
// byte at a sequence start, or a multi-byte sequence truncated at strEnd.
// (Previously a truncated tail advanced `it` past strEnd, so the
// `it != strEnd` loop condition never fired again — out-of-bounds reads.)
void byte2Char(const char* strBegin, const char* strEnd, std::vector<uint32_t>& startPos, std::vector<uint16_t>& length)
{
	if (strBegin == strEnd) return;
	// charPos[i] = byte offset where the i-th code point starts;
	// a trailing sentinel holds the total byte length.
	std::vector<size_t> charPos;
	for (auto it = strBegin; it != strEnd; )
	{
		charPos.emplace_back(it - strBegin);
		uint8_t c = *it;
		size_t seqLen;
		if ((c & 0xF8) == 0xF0) seqLen = 4;      // 11110xxx: 4-byte sequence
		else if ((c & 0xF0) == 0xE0) seqLen = 3; // 1110xxxx: 3-byte sequence
		else if ((c & 0xE0) == 0xC0) seqLen = 2; // 110xxxxx: 2-byte sequence
		else if (c & 0x80)
		{
			// 10xxxxxx at a sequence start: stray continuation byte.
			throw std::runtime_error{ "utf-8 decoding error" };
		}
		else seqLen = 1;                          // 0xxxxxxx: ASCII
		// Reject a sequence truncated at the end of the buffer; otherwise
		// `it` would step past strEnd and the loop would never terminate.
		if ((size_t)(strEnd - it) < seqLen) throw std::runtime_error{ "utf-8 decoding error" };
		it += seqLen;
	}
	charPos.emplace_back(strEnd - strBegin);

	// Map each byte span [s, e) to code-point indices.
	for (size_t i = 0; i < startPos.size(); ++i)
	{
		size_t s = startPos[i], e = (size_t)startPos[i] + length[i];
		startPos[i] = std::lower_bound(charPos.begin(), charPos.end(), s) - charPos.begin();
		length[i] = std::lower_bound(charPos.begin(), charPos.end(), e) - charPos.begin() - startPos[i];
	}
}

void byte2Char(const string& str, vector<uint32_t>& startPos, vector<uint16_t>& length)
{
return byte2Char(&str[0], &str[0] + str.size(), startPos, length);
}

// Byte-to-character offset conversion for tomoto::SharedString input;
// forwards the string's full [begin, end) range to the pointer overload.
void byte2Char(const tomoto::SharedString& str, vector<uint32_t>& startPos, vector<uint16_t>& length)
{
	byte2Char(str.begin(), str.end(), startPos, length);
}

void TopicModelObject::dealloc(TopicModelObject* self)
{
DEBUG_LOG("TopicModelObject Dealloc " << self);
Expand Down
127 changes: 117 additions & 10 deletions src/python/py_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

using namespace std;

void byte2Char(const tomoto::SharedString& str, vector<uint32_t>& startPos, vector<uint16_t>& length);

namespace py
{
template<>
Expand Down Expand Up @@ -390,10 +392,14 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa
PyObject* pos = PyTuple_GetItem(t, 1);
PyObject* len = PyTuple_GetItem(t, 2);
if (!(PyUnicode_Check(word) && PyLong_Check(pos) && PyLong_Check(len))) throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." };

py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) };
if (!stopRet) throw py::ExcPropagation{};
doc.words.emplace_back(PyObject_IsTrue(stopRet) ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word)));
bool isStopword = false;
if (stopwords != Py_None)
{
py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) };
if (!stopRet) throw py::ExcPropagation{};
isStopword = PyObject_IsTrue(stopRet);
}
doc.words.emplace_back(isStopword ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word)));
doc.origWordPos.emplace_back(PyLong_AsLong(pos));
doc.origWordLen.emplace_back(PyLong_AsLong(len));
}
Expand All @@ -410,9 +416,14 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa
if (!PyObject_IsTrue(words)) return py::buildPyValue(-1);
py::foreach<string>(words, [&](const string& w)
{
py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(w) }) };
if (!stopRet) throw py::ExcPropagation{};
doc.words.emplace_back(PyObject_IsTrue(stopRet) ? -1 : self->vocab->vocabs->add(w));
bool isStopword = false;
if (stopwords != Py_None)
{
py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(w) }) };
if (!stopRet) throw py::ExcPropagation{};
isStopword = PyObject_IsTrue(stopRet);
}
doc.words.emplace_back(isStopword ? -1 : self->vocab->vocabs->add(w));
}, "");
}
PyObject* key, * value;
Expand Down Expand Up @@ -447,6 +458,97 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa
});
}


// corpus.add_docs(tokenized_iter, raw_iter, metadata_iter)
//
// Bulk-inserts pre-tokenized documents into an independent corpus.
// For each tokenized document drawn from `tokenized_iter`, one item is also
// drawn from `raw_iter` (the raw text, str) and `metadata_iter` (a dict of
// per-document fields; the "uid" key is handled specially).
// Returns the number of documents added, or raises on error.
PyObject* CorpusObject::addDocs(CorpusObject* self, PyObject* args, PyObject* kwargs)
{
	return py::handleExc([&]()
	{
		if (!self->isIndependent())
			throw py::RuntimeError{ "Cannot modify the corpus bound to a topic model." };
		// Fix: the guard requires 3 positional arguments, but the old message claimed 1.
		if (PyTuple_Size(args) != 3) throw py::ValueError{ "function takes 3 positional arguments." };
		PyObject* tokenized_iter = PyTuple_GetItem(args, 0);
		PyObject* raw_iter = PyTuple_GetItem(args, 1);
		PyObject* metadata_iter = PyTuple_GetItem(args, 2);

		size_t cnt = 0;

		// Optional stopword predicate attached to the corpus object (may be None).
		py::UniqueObj stopwords{ PyObject_GetAttrString((PyObject*)self, "_stopwords") };

		py::foreach<PyObject*>(tokenized_iter, [&](PyObject* tokenized)
		{
			tomoto::RawDoc doc;
			py::foreach<PyObject*>(tokenized, [&](PyObject* t)
			{
				if (PyUnicode_Check(t))
				{
					// Bare token: no positional info, never treated as a stopword.
					doc.words.emplace_back(self->vocab->vocabs->add(PyUnicode_AsUTF8(t)));
				}
				else if (PyTuple_Size(t) == 3)
				{
					// (word, byte_position, byte_length) triple.
					PyObject* word = PyTuple_GetItem(t, 0);
					PyObject* pos = PyTuple_GetItem(t, 1);
					PyObject* len = PyTuple_GetItem(t, 2);
					if (!(PyUnicode_Check(word) && PyLong_Check(pos) && PyLong_Check(len))) throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." };

					// A stopword keeps its position but maps to word id -1.
					bool isStopword = false;
					if (stopwords != Py_None)
					{
						py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) };
						if (!stopRet) throw py::ExcPropagation{};
						isStopword = PyObject_IsTrue(stopRet);
					}
					doc.words.emplace_back(isStopword ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word)));
					doc.origWordPos.emplace_back(PyLong_AsLong(pos));
					doc.origWordLen.emplace_back(PyLong_AsLong(len));
				}
				else
				{
					throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." };
				}
			}, "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`).");
			// NOTE(review): PyIter_Next returns null both on exhaustion and on
			// error; this assumes raw_iter/metadata_iter yield at least as many
			// items as tokenized_iter — confirm callers guarantee this.
			py::UniqueObj raw{ PyIter_Next(raw_iter) };
			if (!raw) throw py::ExcPropagation{};
			py::UniqueObj metadata{ PyIter_Next(metadata_iter) };
			if (!metadata) throw py::ExcPropagation{};

			doc.rawStr = tomoto::SharedString{ PyUnicode_AsUTF8(raw) };

			PyObject* key, * value;
			Py_ssize_t p = 0;
			while (PyDict_Next(metadata, &p, &key, &value))
			{
				const char* utf8 = PyUnicode_AsUTF8(key);
				if (utf8 == string{ "uid" })
				{
					// "uid" becomes the document's unique id; must be a
					// non-empty str, unique within the corpus. None is skipped.
					if (value == Py_None) continue;
					const char* uid = PyUnicode_AsUTF8(value);
					if (!uid) throw py::ValueError{ "`uid` must be str type." };
					string suid = uid;
					if (suid.empty()) throw py::ValueError{ "wrong `uid` value : empty str not allowed" };
					if (self->invmap.find(suid) != self->invmap.end())
					{
						throw py::ValueError{ "there is a document with uid = " + py::repr(value) + " already." };
					}
					self->invmap.emplace(suid, self->docs.size());
					doc.docUid = tomoto::SharedString{ uid };
					continue;
				}

				// Any other metadata value is stored as an owned PyObject*;
				// the shared_ptr deleter releases the reference taken here.
				Py_INCREF(value);
				doc.misc[utf8] = std::shared_ptr<void>{ value, [](void* p)
				{
					Py_XDECREF(p);
				} };
			}
			self->docs.emplace_back(move(doc));
			cnt++;
		}, "");

		return py::buildPyValue(cnt);
	});
}


PyObject* CorpusObject::extractNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs)
{
size_t minCf = 10, minDf = 5, maxLen = 5, maxCand = 5000;
Expand Down Expand Up @@ -797,6 +899,7 @@ static PyMethodDef UtilsCorpus_methods[] =
{ "__getstate__", (PyCFunction)CorpusObject::getstate, METH_NOARGS, "" },
{ "__setstate__", (PyCFunction)CorpusObject::setstate, METH_VARARGS, "" },
{ "add_doc", (PyCFunction)CorpusObject::addDoc, METH_VARARGS | METH_KEYWORDS, "" },
{ "add_docs", (PyCFunction)CorpusObject::addDocs, METH_VARARGS | METH_KEYWORDS, "" },
{ "extract_ngrams", (PyCFunction)CorpusObject::extractNgrams, METH_VARARGS | METH_KEYWORDS, "" },
{ "concat_ngrams", (PyCFunction)CorpusObject::concatNgrams, METH_VARARGS | METH_KEYWORDS, "" },
{ nullptr }
Expand Down Expand Up @@ -1025,10 +1128,14 @@ PyObject* DocumentObject::getSpan(DocumentObject* self, void* closure)
{
return py::handleExc([&]()
{
PyObject* ret = PyList_New(self->doc->origWordPos.size());
for (size_t i = 0; i < self->doc->origWordPos.size(); ++i)
auto starts = self->doc->origWordPos;
auto lengthes = self->doc->origWordLen;
byte2Char(self->doc->rawStr, starts, lengthes);

PyObject* ret = PyList_New(starts.size());
for (size_t i = 0; i < starts.size(); ++i)
{
size_t begin = self->doc->origWordPos[i], end = begin + self->doc->origWordLen[i];
size_t begin = starts[i], end = begin + lengthes[i];
PyList_SET_ITEM(ret, i, py::buildPyTuple(begin, end));
}
return ret;
Expand Down
1 change: 1 addition & 0 deletions src/python/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ struct CorpusObject
static PyObject* getstate(CorpusObject* self, PyObject*);
static PyObject* setstate(CorpusObject* self, PyObject* args);
static PyObject* addDoc(CorpusObject* self, PyObject* args, PyObject* kwargs);
static PyObject* addDocs(CorpusObject* self, PyObject* args, PyObject* kwargs);
static PyObject* extractNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs);
static PyObject* concatNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs);
static Py_ssize_t len(CorpusObject* self);
Expand Down
1 change: 1 addition & 0 deletions tomotopy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class ParallelScheme(IntEnum):
import tomotopy.utils as utils
import tomotopy.coherence as coherence
import tomotopy.label as label
import tomotopy.viewer as viewer

import os
if os.environ.get('TOMOTOPY_LANG') == 'kr':
Expand Down
25 changes: 20 additions & 5 deletions tomotopy/_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,21 @@ def topics_info_DTModel(mdl, file, topic_word_top_n):
words = ' '.join(w for w, _ in mdl.get_topic_words(k, t, top_n=topic_word_top_n))
print('| t={} ({}) : {}'.format(t, topic_cnt[t, k], words), file=file)

def basic_info(mdl, file=None):
    """Write the model's basic-info section to `file` via the bound helper."""
    section = 'basic_info'
    call_method_bound(mdl, section, globals(), file=file)

def training_info(mdl, file=None):
    """Write the model's training-info section to `file` via the bound helper."""
    section = 'training_info'
    call_method_bound(mdl, section, globals(), file=file)

def initial_params_info(mdl, file=None):
    """Write the model's initial-parameter section to `file` via the bound helper."""
    section = 'initial_params_info'
    call_method_bound(mdl, section, globals(), file=file)

def params_info(mdl, file=None):
    """Write the model's current-parameter section to `file` via the bound helper."""
    section = 'params_info'
    call_method_bound(mdl, section, globals(), file=file)

def topics_info(mdl, file=None, topic_word_top_n=5):
    """Write the per-topic word listing (top `topic_word_top_n` words each)
    to `file` via the bound helper."""
    section = 'topics_info'
    call_method_bound(mdl, section, globals(),
                      file=file, topic_word_top_n=topic_word_top_n)

def summary(mdl, initial_hp=True, params=True, topic_word_top_n=5, file=None, flush=False):
import tomotopy as tp
import numpy as np
Expand All @@ -325,25 +340,25 @@ def summary(mdl, initial_hp=True, params=True, topic_word_top_n=5, file=None, fl
flush = flush or False

print('<Basic Info>', file=file)
call_method_bound(mdl, 'basic_info', globals(), file=file)
basic_info(mdl, file=file)
print('|', file=file)
print('<Training Info>', file=file)
call_method_bound(mdl, 'training_info', globals(), file=file)
training_info(mdl, file=file)
print('|', file=file)

if initial_hp:
print('<Initial Parameters>', file=file)
call_method_bound(mdl, 'initial_params_info', globals(), file=file)
initial_params_info(mdl, file=file)
print('|', file=file)

if params:
print('<Parameters>', file=file)
call_method_bound(mdl, 'params_info', globals(), file=file)
params_info(mdl, file=file)
print('|', file=file)

if topic_word_top_n > 0:
print('<Topics>', file=file)
call_method_bound(mdl, 'topics_info', globals(), file=file, topic_word_top_n=topic_word_top_n)
topics_info(mdl, file=file, topic_word_top_n=topic_word_top_n)
print('|', file=file)

print(file=file, flush=flush)
Loading

0 comments on commit a1a2a8b

Please sign in to comment.