Skip to content

Commit

Permalink
Merge pull request #214 from bab2min/dev_viewer
Browse files Browse the repository at this point in the history
Simple Topic Model Viewer
  • Loading branch information
bab2min authored Dec 18, 2023
2 parents 425197e + 64b90b7 commit a1a2a8b
Show file tree
Hide file tree
Showing 9 changed files with 860 additions and 40 deletions.
47 changes: 47 additions & 0 deletions src/python/py_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,53 @@ void char2Byte(const tomoto::SharedString& str, vector<uint32_t>& startPos, vect
return char2Byte(str.begin(), str.end(), startPos, length);
}

// Converts byte-based (offset, length) pairs into code-point-based pairs,
// in place, for the UTF-8 text in [strBegin, strEnd).
//
// On entry, startPos[i]/length[i] are byte offsets/lengths into the buffer;
// on return they are code-point offsets/lengths. An offset that falls inside
// a multi-byte sequence is snapped forward to the next code-point boundary
// (via lower_bound on the boundary table).
//
// Throws std::runtime_error on malformed UTF-8: an unexpected continuation
// byte at a sequence start, or a multi-byte sequence truncated at strEnd.
// (Previously a truncated tail advanced `it` past strEnd, so the
// `it != strEnd` loop condition never fired again — out-of-bounds reads.)
void byte2Char(const char* strBegin, const char* strEnd, std::vector<uint32_t>& startPos, std::vector<uint16_t>& length)
{
	if (strBegin == strEnd) return;
	// charPos[i] = byte offset where the i-th code point starts;
	// a trailing sentinel holds the total byte length.
	std::vector<size_t> charPos;
	for (auto it = strBegin; it != strEnd; )
	{
		charPos.emplace_back(it - strBegin);
		uint8_t c = *it;
		size_t seqLen;
		if ((c & 0xF8) == 0xF0) seqLen = 4;      // 11110xxx: 4-byte sequence
		else if ((c & 0xF0) == 0xE0) seqLen = 3; // 1110xxxx: 3-byte sequence
		else if ((c & 0xE0) == 0xC0) seqLen = 2; // 110xxxxx: 2-byte sequence
		else if (c & 0x80)
		{
			// 10xxxxxx at a sequence start: stray continuation byte.
			throw std::runtime_error{ "utf-8 decoding error" };
		}
		else seqLen = 1;                          // 0xxxxxxx: ASCII
		// Reject a sequence truncated at the end of the buffer; otherwise
		// `it` would step past strEnd and the loop would never terminate.
		if ((size_t)(strEnd - it) < seqLen) throw std::runtime_error{ "utf-8 decoding error" };
		it += seqLen;
	}
	charPos.emplace_back(strEnd - strBegin);

	// Map each byte span [s, e) to code-point indices.
	for (size_t i = 0; i < startPos.size(); ++i)
	{
		size_t s = startPos[i], e = (size_t)startPos[i] + length[i];
		startPos[i] = std::lower_bound(charPos.begin(), charPos.end(), s) - charPos.begin();
		length[i] = std::lower_bound(charPos.begin(), charPos.end(), e) - charPos.begin() - startPos[i];
	}
}

void byte2Char(const string& str, vector<uint32_t>& startPos, vector<uint16_t>& length)
{
return byte2Char(&str[0], &str[0] + str.size(), startPos, length);
}

// Byte-to-character offset conversion for tomoto::SharedString input;
// forwards the string's full [begin, end) range to the pointer overload.
void byte2Char(const tomoto::SharedString& str, vector<uint32_t>& startPos, vector<uint16_t>& length)
{
	byte2Char(str.begin(), str.end(), startPos, length);
}

void TopicModelObject::dealloc(TopicModelObject* self)
{
DEBUG_LOG("TopicModelObject Dealloc " << self);
Expand Down
127 changes: 117 additions & 10 deletions src/python/py_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

using namespace std;

void byte2Char(const tomoto::SharedString& str, vector<uint32_t>& startPos, vector<uint16_t>& length);

namespace py
{
template<>
Expand Down Expand Up @@ -390,10 +392,14 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa
PyObject* pos = PyTuple_GetItem(t, 1);
PyObject* len = PyTuple_GetItem(t, 2);
if (!(PyUnicode_Check(word) && PyLong_Check(pos) && PyLong_Check(len))) throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." };

py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) };
if (!stopRet) throw py::ExcPropagation{};
doc.words.emplace_back(PyObject_IsTrue(stopRet) ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word)));
bool isStopword = false;
if (stopwords != Py_None)
{
py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) };
if (!stopRet) throw py::ExcPropagation{};
isStopword = PyObject_IsTrue(stopRet);
}
doc.words.emplace_back(isStopword ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word)));
doc.origWordPos.emplace_back(PyLong_AsLong(pos));
doc.origWordLen.emplace_back(PyLong_AsLong(len));
}
Expand All @@ -410,9 +416,14 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa
if (!PyObject_IsTrue(words)) return py::buildPyValue(-1);
py::foreach<string>(words, [&](const string& w)
{
py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(w) }) };
if (!stopRet) throw py::ExcPropagation{};
doc.words.emplace_back(PyObject_IsTrue(stopRet) ? -1 : self->vocab->vocabs->add(w));
bool isStopword = false;
if (stopwords != Py_None)
{
py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(w) }) };
if (!stopRet) throw py::ExcPropagation{};
isStopword = PyObject_IsTrue(stopRet);
}
doc.words.emplace_back(isStopword ? -1 : self->vocab->vocabs->add(w));
}, "");
}
PyObject* key, * value;
Expand Down Expand Up @@ -447,6 +458,97 @@ PyObject* CorpusObject::addDoc(CorpusObject* self, PyObject* args, PyObject* kwa
});
}


// corpus.add_docs(tokenized_iter, raw_iter, metadata_iter)
//
// Bulk-inserts pre-tokenized documents into an independent corpus.
// For each tokenized document drawn from `tokenized_iter`, one item is also
// drawn from `raw_iter` (the raw text, str) and `metadata_iter` (a dict of
// per-document fields; the "uid" key is handled specially).
// Returns the number of documents added, or raises on error.
PyObject* CorpusObject::addDocs(CorpusObject* self, PyObject* args, PyObject* kwargs)
{
	return py::handleExc([&]()
	{
		if (!self->isIndependent())
			throw py::RuntimeError{ "Cannot modify the corpus bound to a topic model." };
		// Fix: the guard requires 3 positional arguments, but the old message claimed 1.
		if (PyTuple_Size(args) != 3) throw py::ValueError{ "function takes 3 positional arguments." };
		PyObject* tokenized_iter = PyTuple_GetItem(args, 0);
		PyObject* raw_iter = PyTuple_GetItem(args, 1);
		PyObject* metadata_iter = PyTuple_GetItem(args, 2);

		size_t cnt = 0;

		// Optional stopword predicate attached to the corpus object (may be None).
		py::UniqueObj stopwords{ PyObject_GetAttrString((PyObject*)self, "_stopwords") };

		py::foreach<PyObject*>(tokenized_iter, [&](PyObject* tokenized)
		{
			tomoto::RawDoc doc;
			py::foreach<PyObject*>(tokenized, [&](PyObject* t)
			{
				if (PyUnicode_Check(t))
				{
					// Bare token: no positional info, never treated as a stopword.
					doc.words.emplace_back(self->vocab->vocabs->add(PyUnicode_AsUTF8(t)));
				}
				else if (PyTuple_Size(t) == 3)
				{
					// (word, byte_position, byte_length) triple.
					PyObject* word = PyTuple_GetItem(t, 0);
					PyObject* pos = PyTuple_GetItem(t, 1);
					PyObject* len = PyTuple_GetItem(t, 2);
					if (!(PyUnicode_Check(word) && PyLong_Check(pos) && PyLong_Check(len))) throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." };

					// A stopword keeps its position but maps to word id -1.
					bool isStopword = false;
					if (stopwords != Py_None)
					{
						py::UniqueObj stopRet{ PyObject_CallObject(stopwords, py::UniqueObj{ py::buildPyTuple(word) }) };
						if (!stopRet) throw py::ExcPropagation{};
						isStopword = PyObject_IsTrue(stopRet);
					}
					doc.words.emplace_back(isStopword ? -1 : self->vocab->vocabs->add(PyUnicode_AsUTF8(word)));
					doc.origWordPos.emplace_back(PyLong_AsLong(pos));
					doc.origWordLen.emplace_back(PyLong_AsLong(len));
				}
				else
				{
					throw py::ValueError{ "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`)." };
				}
			}, "`tokenizer` must return an iterable of `str` or `tuple` of (`str`, `int`, `int`).");
			// NOTE(review): PyIter_Next returns null both on exhaustion and on
			// error; this assumes raw_iter/metadata_iter yield at least as many
			// items as tokenized_iter — confirm callers guarantee this.
			py::UniqueObj raw{ PyIter_Next(raw_iter) };
			if (!raw) throw py::ExcPropagation{};
			py::UniqueObj metadata{ PyIter_Next(metadata_iter) };
			if (!metadata) throw py::ExcPropagation{};

			doc.rawStr = tomoto::SharedString{ PyUnicode_AsUTF8(raw) };

			PyObject* key, * value;
			Py_ssize_t p = 0;
			while (PyDict_Next(metadata, &p, &key, &value))
			{
				const char* utf8 = PyUnicode_AsUTF8(key);
				if (utf8 == string{ "uid" })
				{
					// "uid" becomes the document's unique id; must be a
					// non-empty str, unique within the corpus. None is skipped.
					if (value == Py_None) continue;
					const char* uid = PyUnicode_AsUTF8(value);
					if (!uid) throw py::ValueError{ "`uid` must be str type." };
					string suid = uid;
					if (suid.empty()) throw py::ValueError{ "wrong `uid` value : empty str not allowed" };
					if (self->invmap.find(suid) != self->invmap.end())
					{
						throw py::ValueError{ "there is a document with uid = " + py::repr(value) + " already." };
					}
					self->invmap.emplace(suid, self->docs.size());
					doc.docUid = tomoto::SharedString{ uid };
					continue;
				}

				// Any other metadata value is stored as an owned PyObject*;
				// the shared_ptr deleter releases the reference taken here.
				Py_INCREF(value);
				doc.misc[utf8] = std::shared_ptr<void>{ value, [](void* p)
				{
					Py_XDECREF(p);
				} };
			}
			self->docs.emplace_back(move(doc));
			cnt++;
		}, "");

		return py::buildPyValue(cnt);
	});
}


PyObject* CorpusObject::extractNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs)
{
size_t minCf = 10, minDf = 5, maxLen = 5, maxCand = 5000;
Expand Down Expand Up @@ -797,6 +899,7 @@ static PyMethodDef UtilsCorpus_methods[] =
{ "__getstate__", (PyCFunction)CorpusObject::getstate, METH_NOARGS, "" },
{ "__setstate__", (PyCFunction)CorpusObject::setstate, METH_VARARGS, "" },
{ "add_doc", (PyCFunction)CorpusObject::addDoc, METH_VARARGS | METH_KEYWORDS, "" },
{ "add_docs", (PyCFunction)CorpusObject::addDocs, METH_VARARGS | METH_KEYWORDS, "" },
{ "extract_ngrams", (PyCFunction)CorpusObject::extractNgrams, METH_VARARGS | METH_KEYWORDS, "" },
{ "concat_ngrams", (PyCFunction)CorpusObject::concatNgrams, METH_VARARGS | METH_KEYWORDS, "" },
{ nullptr }
Expand Down Expand Up @@ -1025,10 +1128,14 @@ PyObject* DocumentObject::getSpan(DocumentObject* self, void* closure)
{
return py::handleExc([&]()
{
PyObject* ret = PyList_New(self->doc->origWordPos.size());
for (size_t i = 0; i < self->doc->origWordPos.size(); ++i)
auto starts = self->doc->origWordPos;
auto lengthes = self->doc->origWordLen;
byte2Char(self->doc->rawStr, starts, lengthes);

PyObject* ret = PyList_New(starts.size());
for (size_t i = 0; i < starts.size(); ++i)
{
size_t begin = self->doc->origWordPos[i], end = begin + self->doc->origWordLen[i];
size_t begin = starts[i], end = begin + lengthes[i];
PyList_SET_ITEM(ret, i, py::buildPyTuple(begin, end));
}
return ret;
Expand Down
1 change: 1 addition & 0 deletions src/python/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ struct CorpusObject
static PyObject* getstate(CorpusObject* self, PyObject*);
static PyObject* setstate(CorpusObject* self, PyObject* args);
static PyObject* addDoc(CorpusObject* self, PyObject* args, PyObject* kwargs);
static PyObject* addDocs(CorpusObject* self, PyObject* args, PyObject* kwargs);
static PyObject* extractNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs);
static PyObject* concatNgrams(CorpusObject* self, PyObject* args, PyObject* kwargs);
static Py_ssize_t len(CorpusObject* self);
Expand Down
1 change: 1 addition & 0 deletions tomotopy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class ParallelScheme(IntEnum):
import tomotopy.utils as utils
import tomotopy.coherence as coherence
import tomotopy.label as label
import tomotopy.viewer as viewer

import os
if os.environ.get('TOMOTOPY_LANG') == 'kr':
Expand Down
25 changes: 20 additions & 5 deletions tomotopy/_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,21 @@ def topics_info_DTModel(mdl, file, topic_word_top_n):
words = ' '.join(w for w, _ in mdl.get_topic_words(k, t, top_n=topic_word_top_n))
print('| t={} ({}) : {}'.format(t, topic_cnt[t, k], words), file=file)

def basic_info(mdl, file=None):
    """Write the model's basic-info section to `file` via the bound helper."""
    section = 'basic_info'
    call_method_bound(mdl, section, globals(), file=file)

def training_info(mdl, file=None):
    """Write the model's training-info section to `file` via the bound helper."""
    section = 'training_info'
    call_method_bound(mdl, section, globals(), file=file)

def initial_params_info(mdl, file=None):
    """Write the model's initial-parameter section to `file` via the bound helper."""
    section = 'initial_params_info'
    call_method_bound(mdl, section, globals(), file=file)

def params_info(mdl, file=None):
    """Write the model's current-parameter section to `file` via the bound helper."""
    section = 'params_info'
    call_method_bound(mdl, section, globals(), file=file)

def topics_info(mdl, file=None, topic_word_top_n=5):
    """Write the per-topic word listing (top `topic_word_top_n` words each)
    to `file` via the bound helper."""
    section = 'topics_info'
    call_method_bound(mdl, section, globals(),
                      file=file, topic_word_top_n=topic_word_top_n)

def summary(mdl, initial_hp=True, params=True, topic_word_top_n=5, file=None, flush=False):
import tomotopy as tp
import numpy as np
Expand All @@ -325,25 +340,25 @@ def summary(mdl, initial_hp=True, params=True, topic_word_top_n=5, file=None, fl
flush = flush or False

print('<Basic Info>', file=file)
call_method_bound(mdl, 'basic_info', globals(), file=file)
basic_info(mdl, file=file)
print('|', file=file)
print('<Training Info>', file=file)
call_method_bound(mdl, 'training_info', globals(), file=file)
training_info(mdl, file=file)
print('|', file=file)

if initial_hp:
print('<Initial Parameters>', file=file)
call_method_bound(mdl, 'initial_params_info', globals(), file=file)
initial_params_info(mdl, file=file)
print('|', file=file)

if params:
print('<Parameters>', file=file)
call_method_bound(mdl, 'params_info', globals(), file=file)
params_info(mdl, file=file)
print('|', file=file)

if topic_word_top_n > 0:
print('<Topics>', file=file)
call_method_bound(mdl, 'topics_info', globals(), file=file, topic_word_top_n=topic_word_top_n)
topics_info(mdl, file=file, topic_word_top_n=topic_word_top_n)
print('|', file=file)

print(file=file, flush=flush)
Loading

0 comments on commit a1a2a8b

Please sign in to comment.