From e085e9fd2a26cd4cdb02f16256b3ecfde8b577dd Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Tue, 5 Jun 2018 16:41:22 +0800 Subject: [PATCH 01/63] wrap core apis --- MANIFEST.in | 6 +++++ example.py | 5 ++-- setup.py | 4 +-- src/main.cpp | 76 ++++++++++++++++++++++++++++++++++++++++------------ 4 files changed, 70 insertions(+), 21 deletions(-) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..cd7cefd --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include README.md +include setup.py + +recursive-include cppjieba/include/cppjieba *.hpp +recursive-include cppjieba/dict *.utf8 +recursive-include cppjieba/deps *.h *.cc *.hpp diff --git a/example.py b/example.py index 62f47a1..c9eae04 100644 --- a/example.py +++ b/example.py @@ -1,8 +1,8 @@ -from cppjieba_py import jieba +from cppjieba_py import Tokenizer ,cut def main(): - jieba_instance = jieba("cppjieba/dict/user.dict.utf8") + jieba_instance = Tokenizer() seg_list = jieba_instance.cut("我来到北京清华大学") print("Full Mode: " + "/ ".join(seg_list)) # 全模式 @@ -14,6 +14,7 @@ def main(): "小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list)) + print(cut("我来到北京清华大学")) if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index 257eee5..5a2f852 100644 --- a/setup.py +++ b/setup.py @@ -2,8 +2,8 @@ from setuptools.command.build_ext import build_ext import sys import setuptools - -__version__ = '0.0.1' +import os +__version__ = '0.0.2' class get_pybind_include(object): diff --git a/src/main.cpp b/src/main.cpp index c256236..08453a2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,31 +9,70 @@ const std::string DICT_PATH = "cppjieba/dict/jieba.dict.utf8"; const std::string HMM_PATH = "cppjieba/dict/hmm_model.utf8"; const std::string IDF_PATH = "cppjieba/dict/idf.utf8"; const std::string STOP_WORD_PATH = "cppjieba/dict/stop_words.utf8"; +using namespace std; -struct JiebaCpp +struct Tokenizer { cppjieba::Jieba jieba; public: - JiebaCpp(const std::string &USER_DICT_PATH) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; + Tokenizer(const string &USER_DICT_PATH ) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; + Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; - std::vector cut(std::string &text) + vector cut(const string &sentence, bool hmm = true) { - std::vector words; - jieba.Cut(text, words, true); + vector words; + jieba.Cut(sentence, words, hmm); return words; - }; + } - std::vector cutForSearch(std::string &text) + vector cut_all(const string &sentence) { - std::vector words; - jieba.CutForSearch(text, words, true); + vector words; + jieba.CutAll(sentence, words); return words; + } + + vector cut_for_search(const string &sentence, bool hmm = true) + { + vector words; + jieba.CutForSearch(sentence, words, hmm); + return words; + } + +}; + + +namespace Jieba +{ + Tokenizer* dt; + + void initlize(){ + dt = new Tokenizer(); }; - void InsertUserWord(std::string &word) + void init_check(){ + if(!dt){ + initlize(); + } + }; + + vector cut(const string &sentence, bool hmm = true) + { + init_check(); + return dt->cut(sentence,hmm); + }; + + vector cut_all(const string &sentence) { - jieba.InsertUserWord(word); + init_check(); + return dt->cut_all(sentence); + }; + + vector cut_for_search(const string &sentence, bool hmm = true) + { + init_check(); + return dt->cut_for_search(sentence,hmm); }; }; @@ -41,10 +80,13 @@ struct JiebaCpp PYBIND11_MODULE(cppjieba_py, m) { m.doc() = "python extension for cppjieba"; // optional module docstring - - py::class_(m, "jieba") - .def(py::init()) - .def("cut", &JiebaCpp::cut) - .def("cut_for_search", &JiebaCpp::cutForSearch) - .def("add_word", &JiebaCpp::InsertUserWord); + m.def("cut", &Jieba::cut,py::arg("sentence"),py::arg("hmm") = true); + m.def("cut_all", &Jieba::cut_all); + m.def("cut_for_search", &Jieba::cut_for_search,py::arg("sentence"),py::arg("hmm") = true); + py::class_(m, "Tokenizer") + .def(py::init<>()) + .def(py::init()) + .def("cut", &Tokenizer::cut,py::arg("sentence"),py::arg("hmm") = true) + .def("cut_all", &Tokenizer::cut_all) + .def("cut_for_search", &Tokenizer::cut_for_search,py::arg("sentence"),py::arg("hmm") = true); } \ No newline at end of file From 002716a779c3537cff525df36600612c349cdf34 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Tue, 5 Jun 2018 23:51:58 +0800 Subject: [PATCH 02/63] cut* and lcut* --- cppjiebapy/__init__.py | 25 +++++++ example.py | 10 ++- setup.py | 1 + src/main.cpp | 149 ++++++++++++++++++++++++++++++----------- 4 files changed, 144 insertions(+), 41 deletions(-) create mode 100644 cppjiebapy/__init__.py diff --git a/cppjiebapy/__init__.py b/cppjiebapy/__init__.py new file mode 100644 index 0000000..3d829ba --- /dev/null +++ b/cppjiebapy/__init__.py @@ -0,0 +1,25 @@ +from cppjieba_py import cut_for_search_internal,cut_internal as cut_i +from cppjieba_py import Tokenizer +from cppjieba_py import lcut,lcut_for_search +def cut(*args,**kvargs): + it = cut_i(*args,**kvargs) + for word in it: + yield word.word + +def cut_for_search(*args,**kvargs): + it = cut_for_search_internal(*args,**kvargs) + for word in it: + yield word.word + +def c_cut(ins,*args,**kvargs): + it = ins.cut_internal(*args,**kvargs) + for word in it: + yield word.word + +def c_cut_for_search(ins,*args,**kvargs): + it = ins.cut_for_search_internal(*args,**kvargs) + for word in it: + yield word.word + +setattr(Tokenizer,"cut",c_cut) +setattr(Tokenizer,"cut_for_search",c_cut_for_search) diff --git a/example.py b/example.py index c9eae04..027df35 100644 --- a/example.py +++ b/example.py @@ -1,4 +1,4 @@ -from cppjieba_py import Tokenizer ,cut +from cppjiebapy import Tokenizer ,cut,cut_for_search,lcut,lcut_for_search def main(): @@ -7,14 +7,18 @@ def main(): print("Full Mode: " + "/ ".join(seg_list)) # 全模式 - seg_list = jieba_instance.cut("他来到了网易杭研大厦") # 默认是精确模式 + seg_list = jieba_instance.lcut("他来到了网易杭研大厦") # 默认是精确模式 print(", ".join(seg_list)) seg_list = jieba_instance.cut_for_search( "小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list)) - print(cut("我来到北京清华大学")) + print(lcut("我来到北京清华大学")) + print(list(cut("我来到北京清华大学"))) + + print(lcut_for_search("我来到北京清华大学")) + print(list(cut_for_search("我来到北京清华大学"))) if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index 5a2f852..c9c7285 100644 --- a/setup.py +++ b/setup.py @@ -99,6 +99,7 @@ def build_extensions(self): url='https://github.com/fantasy/cppjieba-py', description='A python extension for cppjieba', long_description='', + packages=['cppjiebapy'], ext_modules=ext_modules, install_requires=['pybind11>=2.2'], cmdclass={'build_ext': BuildExt}, diff --git a/src/main.cpp b/src/main.cpp index 08453a2..94a5d39 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include "cppjieba/Jieba.hpp" #include @@ -11,82 +12,154 @@ const std::string IDF_PATH = "cppjieba/dict/idf.utf8"; const std::string STOP_WORD_PATH = "cppjieba/dict/stop_words.utf8"; using namespace std; +PYBIND11_MAKE_OPAQUE(std::vector); + +using Word = cppjieba::Word; + +using WordVector = std::vector; + struct Tokenizer { cppjieba::Jieba jieba; public: - Tokenizer(const string &USER_DICT_PATH ) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; + Tokenizer(const string &USER_DICT_PATH) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; - vector cut(const string &sentence, bool hmm = true) + WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) + { + WordVector words; + if (cut_all) + { + jieba.Cut(sentence, words, HMM); + } + else + { + jieba.CutAll(sentence, words); + } + return words; + } + + vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) { vector words; - jieba.Cut(sentence, words, hmm); + if (cut_all) + { + jieba.Cut(sentence, words, HMM); + } + else + { + jieba.CutAll(sentence, words); + } + return words; } - vector cut_all(const string &sentence) + vector lcut_all(const string &sentence) { vector words; jieba.CutAll(sentence, words); return words; } - vector cut_for_search(const string &sentence, bool hmm = true) + WordVector cut_for_search_internal(const string &sentence, bool HMM = true) { - vector words; - jieba.CutForSearch(sentence, words, hmm); + WordVector words; + jieba.CutForSearch(sentence, words, HMM); return words; } + vector lcut_for_search(const string &sentence, bool HMM = true) + { + vector words; + jieba.CutForSearch(sentence, words, HMM); + return words; + } }; - namespace Jieba { - Tokenizer* dt; - - void initlize(){ - dt = new Tokenizer(); - }; +Tokenizer *dt; - void init_check(){ - if(!dt){ - initlize(); - } - }; +void initlize() +{ + dt = new Tokenizer(); +}; - vector cut(const string &sentence, bool hmm = true) +void init_check() +{ + if (!dt) { - init_check(); - return dt->cut(sentence,hmm); - }; + initlize(); + } +}; - vector cut_all(const string &sentence) - { - init_check(); - return dt->cut_all(sentence); - }; + WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) +{ + init_check(); + return dt->cut_internal(sentence, cut_all, HMM); +}; - vector cut_for_search(const string &sentence, bool hmm = true) - { - init_check(); - return dt->cut_for_search(sentence,hmm); - }; +vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) +{ + init_check(); + return dt->lcut(sentence, cut_all, HMM); +}; +vector lcut_all(const string &sentence) +{ + init_check(); + return dt->lcut_all(sentence); }; +WordVector cut_for_search_internal(const string &sentence, bool HMM = true) +{ + init_check(); + return dt->cut_for_search_internal(sentence, HMM); +}; + +vector lcut_for_search(const string &sentence, bool HMM = true) +{ + init_check(); + return dt->lcut_for_search(sentence, HMM); +}; + +}; // namespace Jieba + PYBIND11_MODULE(cppjieba_py, m) { m.doc() = "python extension for cppjieba"; // optional module docstring - m.def("cut", &Jieba::cut,py::arg("sentence"),py::arg("hmm") = true); - m.def("cut_all", &Jieba::cut_all); - m.def("cut_for_search", &Jieba::cut_for_search,py::arg("sentence"),py::arg("hmm") = true); + + py::class_(m, "Word") + .def_readonly("word", &Word::word, pybind11::return_value_policy::take_ownership) + .def("__str__", [](const Word &v) { + return v.word; + }) + .def("__repr__", [](const Word &v) { + return v.word; + }); + + py::class_(m, "WordVector") + .def(py::init<>()) + .def("clear", &WordVector::clear) + // .def("push_back", &WordVector::push_back) + .def("__len__", [](const WordVector &v) { return v.size(); }) + .def("__iter__", [](WordVector &v) { + return py::make_iterator<>(v.begin(), v.end()); + }); + + m.def("cut_internal", &Jieba::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); + m.def("lcut", &Jieba::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); + m.def("lcut_all", &Jieba::lcut_all); + m.def("lcut_for_search", &Jieba::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true); + m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); + py::class_(m, "Tokenizer") .def(py::init<>()) .def(py::init()) - .def("cut", &Tokenizer::cut,py::arg("sentence"),py::arg("hmm") = true) - .def("cut_all", &Tokenizer::cut_all) - .def("cut_for_search", &Tokenizer::cut_for_search,py::arg("sentence"),py::arg("hmm") = true); + .def("cut_internal", &Tokenizer::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) + .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) + .def("lcut_all", &Tokenizer::lcut_all) + .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) + .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); } \ No newline at end of file From fe0f9591216fbd2df313f5ae4d150e5003b82160 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 01:08:22 +0800 Subject: [PATCH 03/63] posseg --- MANIFEST.in | 2 +- cppjiebapy/__init__.py | 5 +- cppjiebapy/posseg.py | 9 ++++ example.py | 4 ++ setup.py | 2 +- src/main.cpp | 108 ++++++++++++++++++++++++----------------- 6 files changed, 81 insertions(+), 49 deletions(-) create mode 100644 cppjiebapy/posseg.py diff --git a/MANIFEST.in b/MANIFEST.in index cd7cefd..30317ad 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,4 +3,4 @@ include setup.py recursive-include cppjieba/include/cppjieba *.hpp recursive-include cppjieba/dict *.utf8 -recursive-include cppjieba/deps *.h *.cc *.hpp +recursive-include cppjieba/deps *.hpp diff --git a/cppjiebapy/__init__.py b/cppjiebapy/__init__.py index 3d829ba..67c6313 100644 --- a/cppjiebapy/__init__.py +++ b/cppjiebapy/__init__.py @@ -1,8 +1,9 @@ -from cppjieba_py import cut_for_search_internal,cut_internal as cut_i +from cppjieba_py import cut_for_search_internal,tag_internal,cut_internal from cppjieba_py import Tokenizer from cppjieba_py import lcut,lcut_for_search + def cut(*args,**kvargs): - it = cut_i(*args,**kvargs) + it = cut_internal(*args,**kvargs) for word in it: yield word.word diff --git a/cppjiebapy/posseg.py b/cppjiebapy/posseg.py new file mode 100644 index 0000000..d582329 --- /dev/null +++ b/cppjiebapy/posseg.py @@ -0,0 +1,9 @@ +from cppjieba_py import tag_internal + +def cut(sentence): + it = tag_internal(sentence) + for word,tag in it: + yield (word,tag) + +def lcut(sentence): + return list(tag_internal(sentence)) \ No newline at end of file diff --git a/example.py b/example.py index 027df35..dd27991 100644 --- a/example.py +++ b/example.py @@ -1,4 +1,5 @@ from cppjiebapy import Tokenizer ,cut,cut_for_search,lcut,lcut_for_search +import cppjiebapy.posseg as pseg def main(): @@ -20,5 +21,8 @@ def main(): print(lcut_for_search("我来到北京清华大学")) print(list(cut_for_search("我来到北京清华大学"))) + print(pseg.lcut("我来到北京清华大学")) + print(list(pseg.cut("我来到北京清华大学"))) + if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index c9c7285..8a5409e 100644 --- a/setup.py +++ b/setup.py @@ -103,5 +103,5 @@ def build_extensions(self): ext_modules=ext_modules, install_requires=['pybind11>=2.2'], cmdclass={'build_ext': BuildExt}, - zip_safe=False, + # zip_safe=False, ) diff --git a/src/main.cpp b/src/main.cpp index 94a5d39..ae63568 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,6 +18,8 @@ using Word = cppjieba::Word; using WordVector = std::vector; +using WordsTaged = vector>; + struct Tokenizer { cppjieba::Jieba jieba; @@ -26,7 +28,7 @@ struct Tokenizer Tokenizer(const string &USER_DICT_PATH) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; - WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) + WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) { WordVector words; if (cut_all) @@ -38,7 +40,7 @@ struct Tokenizer jieba.CutAll(sentence, words); } return words; - } + }; vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) { @@ -53,76 +55,91 @@ struct Tokenizer } return words; - } + }; vector lcut_all(const string &sentence) { vector words; jieba.CutAll(sentence, words); return words; - } + }; WordVector cut_for_search_internal(const string &sentence, bool HMM = true) { WordVector words; jieba.CutForSearch(sentence, words, HMM); return words; - } + }; vector lcut_for_search(const string &sentence, bool HMM = true) { vector words; jieba.CutForSearch(sentence, words, HMM); return words; - } + }; + + WordsTaged tag_internal(const string &sentence) + { + WordsTaged words; + jieba.Tag(sentence, words); + return words; + }; }; -namespace Jieba -{ -Tokenizer *dt; -void initlize() -{ - dt = new Tokenizer(); -}; -void init_check() + namespace Jieba { - if (!dt) + Tokenizer *dt; + + void initlize() { - initlize(); - } -}; + dt = new Tokenizer(); + }; - WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) -{ - init_check(); - return dt->cut_internal(sentence, cut_all, HMM); -}; + void init_check() + { + if (!dt) + { + initlize(); + } + }; -vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) -{ - init_check(); - return dt->lcut(sentence, cut_all, HMM); -}; + WordsTaged tag_internal(const string &sentence) + { + init_check(); + return dt->tag_internal(sentence); + }; -vector lcut_all(const string &sentence) -{ - init_check(); - return dt->lcut_all(sentence); -}; + WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) + { + init_check(); + return dt->cut_internal(sentence, cut_all, HMM); + }; -WordVector cut_for_search_internal(const string &sentence, bool HMM = true) -{ - init_check(); - return dt->cut_for_search_internal(sentence, HMM); -}; + vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) + { + init_check(); + return dt->lcut(sentence, cut_all, HMM); + }; -vector lcut_for_search(const string &sentence, bool HMM = true) -{ - init_check(); - return dt->lcut_for_search(sentence, HMM); -}; + vector lcut_all(const string &sentence) + { + init_check(); + return dt->lcut_all(sentence); + }; + + WordVector cut_for_search_internal(const string &sentence, bool HMM = true) + { + init_check(); + return dt->cut_for_search_internal(sentence, HMM); + }; + + vector lcut_for_search(const string &sentence, bool HMM = true) + { + init_check(); + return dt->lcut_for_search(sentence, HMM); + }; }; // namespace Jieba @@ -142,7 +159,6 @@ PYBIND11_MODULE(cppjieba_py, m) py::class_(m, "WordVector") .def(py::init<>()) .def("clear", &WordVector::clear) - // .def("push_back", &WordVector::push_back) .def("__len__", [](const WordVector &v) { return v.size(); }) .def("__iter__", [](WordVector &v) { return py::make_iterator<>(v.begin(), v.end()); @@ -153,6 +169,7 @@ PYBIND11_MODULE(cppjieba_py, m) m.def("lcut_all", &Jieba::lcut_all); m.def("lcut_for_search", &Jieba::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true); m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); + m.def("tag_internal", &Jieba::tag_internal, py::arg("sentence")); py::class_(m, "Tokenizer") .def(py::init<>()) @@ -161,5 +178,6 @@ PYBIND11_MODULE(cppjieba_py, m) .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut_all", &Tokenizer::lcut_all) .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) - .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); + .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) + .def("tag_internal", &Tokenizer::tag_internal, py::arg("sentence")); } \ No newline at end of file From ccb8a571d78ec90fed273dd558db264023d16141 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 17:38:48 +0800 Subject: [PATCH 04/63] python yield from vector significant slow than vector,so just wrap iter() to vector --- README.md | 4 ++-- cppjiebapy/__init__.py | 12 ++++-------- example.py | 1 + performace_test/cppjieba.py | 31 +++++++++++++++++++++++++++++++ src/main.cpp | 21 ++------------------- 5 files changed, 40 insertions(+), 29 deletions(-) create mode 100644 performace_test/cppjieba.py diff --git a/README.md b/README.md index 5cae8b3..0a1786c 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 | 方案 | 速度 | | ------------- |:-------------:| -| cppjieba-py | 8s | -| jieba | 77s | +| cppjieba-py | 6.218346 | +| jieba | 1:24.703040 | ## 使用 diff --git a/cppjiebapy/__init__.py b/cppjiebapy/__init__.py index 67c6313..d3d890d 100644 --- a/cppjiebapy/__init__.py +++ b/cppjiebapy/__init__.py @@ -4,23 +4,19 @@ def cut(*args,**kvargs): it = cut_internal(*args,**kvargs) - for word in it: - yield word.word + return iter(it) def cut_for_search(*args,**kvargs): it = cut_for_search_internal(*args,**kvargs) - for word in it: - yield word.word + return iter(it) def c_cut(ins,*args,**kvargs): it = ins.cut_internal(*args,**kvargs) - for word in it: - yield word.word + return iter(it) def c_cut_for_search(ins,*args,**kvargs): it = ins.cut_for_search_internal(*args,**kvargs) - for word in it: - yield word.word + return iter(it) setattr(Tokenizer,"cut",c_cut) setattr(Tokenizer,"cut_for_search",c_cut_for_search) diff --git a/example.py b/example.py index dd27991..ad80c83 100644 --- a/example.py +++ b/example.py @@ -5,6 +5,7 @@ def main(): jieba_instance = Tokenizer() seg_list = jieba_instance.cut("我来到北京清华大学") + print(type(seg_list)) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 diff --git a/performace_test/cppjieba.py b/performace_test/cppjieba.py new file mode 100644 index 0000000..976d990 --- /dev/null +++ b/performace_test/cppjieba.py @@ -0,0 +1,31 @@ +#encoding=utf-8 +import sys +import os +import random +import datetime + +#wget https://raw.githubusercontent.com/yanyiwu/practice/master/nodejs/nodejieba/performance/weicheng.utf8 -O performace_test/weicheng.utf8 + +if __name__ == "__main__": + if sys.argv[1] == "cppjiebapy":# 0:00:03.861202 + import cppjiebapy as jieba + elif sys.argv[1] == "jieba": # 0:01:24.703040 + import jieba + lines = [] + weicheng = os.path.join(os.path.dirname(__file__),"weicheng.utf8") + for line in open(weicheng): + lines.append(line.strip()); + + + result = [""] * 10; + result[random.randint(0, 9)] = '/'.join(jieba.cut("南京长江大桥")) + starttime = datetime.datetime.now() + + for i in range(50): + for line in lines: + r = '/'.join(jieba.cut(line)) + # print(r) + result[random.randint(0, 9)] = r + #result[random.randint(0, 9)] = jieba.cut(line) + endtime = datetime.datetime.now() + print (endtime - starttime) \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index ae63568..49529ef 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,11 +12,11 @@ const std::string IDF_PATH = "cppjieba/dict/idf.utf8"; const std::string STOP_WORD_PATH = "cppjieba/dict/stop_words.utf8"; using namespace std; -PYBIND11_MAKE_OPAQUE(std::vector); +// PYBIND11_MAKE_OPAQUE(std::vector); using Word = cppjieba::Word; -using WordVector = std::vector; +using WordVector = std::vector; using WordsTaged = vector>; @@ -147,23 +147,6 @@ PYBIND11_MODULE(cppjieba_py, m) { m.doc() = "python extension for cppjieba"; // optional module docstring - py::class_(m, "Word") - .def_readonly("word", &Word::word, pybind11::return_value_policy::take_ownership) - .def("__str__", [](const Word &v) { - return v.word; - }) - .def("__repr__", [](const Word &v) { - return v.word; - }); - - py::class_(m, "WordVector") - .def(py::init<>()) - .def("clear", &WordVector::clear) - .def("__len__", [](const WordVector &v) { return v.size(); }) - .def("__iter__", [](WordVector &v) { - return py::make_iterator<>(v.begin(), v.end()); - }); - m.def("cut_internal", &Jieba::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); m.def("lcut", &Jieba::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); m.def("lcut_all", &Jieba::lcut_all); From e688f94ba6bc0a658ad5632e3fbec69a8640316d Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 17:49:29 +0800 Subject: [PATCH 05/63] add .initialize --- cppjiebapy/__init__.py | 2 +- example.py | 13 ++++++++++--- src/main.cpp | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cppjiebapy/__init__.py b/cppjiebapy/__init__.py index d3d890d..5c22846 100644 --- a/cppjiebapy/__init__.py +++ b/cppjiebapy/__init__.py @@ -1,6 +1,6 @@ from cppjieba_py import cut_for_search_internal,tag_internal,cut_internal from cppjieba_py import Tokenizer -from cppjieba_py import lcut,lcut_for_search +from cppjieba_py import lcut,lcut_for_search,initialize def cut(*args,**kvargs): it = cut_internal(*args,**kvargs) diff --git a/example.py b/example.py index ad80c83..cfd682e 100644 --- a/example.py +++ b/example.py @@ -1,5 +1,7 @@ -from cppjiebapy import Tokenizer ,cut,cut_for_search,lcut,lcut_for_search +from cppjiebapy import Tokenizer, cut, cut_for_search, lcut, lcut_for_search, initialize import cppjiebapy.posseg as pseg +import datetime + def main(): @@ -8,14 +10,18 @@ def main(): print(type(seg_list)) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 - seg_list = jieba_instance.lcut("他来到了网易杭研大厦") # 默认是精确模式 print(", ".join(seg_list)) seg_list = jieba_instance.cut_for_search( "小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list)) - + + t1 = datetime.datetime.now() + initialize() + t2 = datetime.datetime.now() + print("initialize costs:%s" % (t2 - t1)) + print(lcut("我来到北京清华大学")) print(list(cut("我来到北京清华大学"))) @@ -25,5 +31,6 @@ def main(): print(pseg.lcut("我来到北京清华大学")) print(list(pseg.cut("我来到北京清华大学"))) + if __name__ == '__main__': main() diff --git a/src/main.cpp b/src/main.cpp index 49529ef..82a681e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -92,7 +92,7 @@ struct Tokenizer { Tokenizer *dt; - void initlize() + void initialize() { dt = new Tokenizer(); }; @@ -101,7 +101,7 @@ struct Tokenizer { if (!dt) { - initlize(); + initialize(); } }; @@ -153,6 +153,7 @@ PYBIND11_MODULE(cppjieba_py, m) m.def("lcut_for_search", &Jieba::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true); m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); m.def("tag_internal", &Jieba::tag_internal, py::arg("sentence")); + m.def("initialize", &Jieba::initialize); py::class_(m, "Tokenizer") .def(py::init<>()) From f15d9a08c6e8a08298297cbb95ba5705a1d1e683 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 17:59:05 +0800 Subject: [PATCH 06/63] renames files --- {cppjiebapy => cppjieba_py}/__init__.py | 6 +++--- {cppjiebapy => cppjieba_py}/posseg.py | 2 +- example.py | 4 ++-- performace_test/cppjieba.py | 4 ++-- setup.py | 4 ++-- src/main.cpp | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) rename {cppjiebapy => cppjieba_py}/__init__.py (76%) rename {cppjiebapy => cppjieba_py}/posseg.py (82%) diff --git a/cppjiebapy/__init__.py b/cppjieba_py/__init__.py similarity index 76% rename from cppjiebapy/__init__.py rename to cppjieba_py/__init__.py index 5c22846..bc43cc3 100644 --- a/cppjiebapy/__init__.py +++ b/cppjieba_py/__init__.py @@ -1,6 +1,6 @@ -from cppjieba_py import cut_for_search_internal,tag_internal,cut_internal -from cppjieba_py import Tokenizer -from cppjieba_py import lcut,lcut_for_search,initialize +from libcppjieba import cut_for_search_internal,tag_internal,cut_internal +from libcppjieba import Tokenizer +from libcppjieba import lcut,lcut_for_search,initialize def cut(*args,**kvargs): it = cut_internal(*args,**kvargs) diff --git a/cppjiebapy/posseg.py b/cppjieba_py/posseg.py similarity index 82% rename from cppjiebapy/posseg.py rename to cppjieba_py/posseg.py index d582329..8cc4657 100644 --- a/cppjiebapy/posseg.py +++ b/cppjieba_py/posseg.py @@ -1,4 +1,4 @@ -from cppjieba_py import tag_internal +from libcppjieba import tag_internal def cut(sentence): it = tag_internal(sentence) diff --git a/example.py b/example.py index cfd682e..eeb031b 100644 --- a/example.py +++ b/example.py @@ -1,5 +1,5 @@ -from cppjiebapy import Tokenizer, cut, cut_for_search, lcut, lcut_for_search, initialize -import cppjiebapy.posseg as pseg +from cppjieba_py import Tokenizer, cut, cut_for_search, lcut, lcut_for_search, initialize +import cppjieba_py.posseg as pseg import datetime diff --git a/performace_test/cppjieba.py b/performace_test/cppjieba.py index 976d990..83f67c9 100644 --- a/performace_test/cppjieba.py +++ b/performace_test/cppjieba.py @@ -7,8 +7,8 @@ #wget https://raw.githubusercontent.com/yanyiwu/practice/master/nodejs/nodejieba/performance/weicheng.utf8 -O performace_test/weicheng.utf8 if __name__ == "__main__": - if sys.argv[1] == "cppjiebapy":# 0:00:03.861202 - import cppjiebapy as jieba + if sys.argv[1] == "cppjieba_py":# 0:00:03.861202 + import cppjieba_py as jieba elif sys.argv[1] == "jieba": # 0:01:24.703040 import jieba lines = [] diff --git a/setup.py b/setup.py index 8a5409e..6f6bf88 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def __str__(self): ext_modules = [ Extension( - 'cppjieba_py', + 'libcppjieba', ['src/main.cpp'], include_dirs=[ # Path to pybind11 headers @@ -99,7 +99,7 @@ def build_extensions(self): url='https://github.com/fantasy/cppjieba-py', description='A python extension for cppjieba', long_description='', - packages=['cppjiebapy'], + packages=['cppjieba_py'], ext_modules=ext_modules, install_requires=['pybind11>=2.2'], cmdclass={'build_ext': BuildExt}, diff --git a/src/main.cpp b/src/main.cpp index 82a681e..c8cd199 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -143,7 +143,7 @@ struct Tokenizer }; // namespace Jieba -PYBIND11_MODULE(cppjieba_py, m) +PYBIND11_MODULE(libcppjieba, m) { m.doc() = "python extension for cppjieba"; // optional module docstring From 0a01f1f2457072c16a8ca4e99c250e849d84eba4 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 18:06:26 +0800 Subject: [PATCH 07/63] update readme --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0a1786c..19ad71d 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 ```python # -*- coding: utf-8 -*- -from cppjieba_py import jieba - -jieba_instance = jieba("cppjieba/dict/user.dict.utf8") +import cppjieba_py as jieba +# or use defualt Tokenizer: jieba.cut +jieba_instance = Tokenizer("cppjieba/dict/user.dict.utf8") seg_list = jieba_instance.cut("我来到北京清华大学") print("Full Mode: " + "/ ".join(seg_list)) # 全模式 @@ -34,7 +34,12 @@ print(", ".join(seg_list)) ``` -## 安装 +## 安装 + +* 从发行包安装 + see [releases](https://github.com/bung87/cppjieba-py/releases) + + ```pip install https://github.com/bung87/cppjieba-py/files//cppjieba_py-.tar.gz``` * 从源代码安装 From 25ed3ff53500655ca6749ca13b48892ad95a888b Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 18:10:13 +0800 Subject: [PATCH 08/63] update version --- cppjieba_py/__init__.py | 2 ++ setup.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index bc43cc3..9eb2787 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -20,3 +20,5 @@ def c_cut_for_search(ins,*args,**kvargs): setattr(Tokenizer,"cut",c_cut) setattr(Tokenizer,"cut_for_search",c_cut_for_search) + +__version__ = '0.0.3' diff --git a/setup.py b/setup.py index 6f6bf88..31c8c23 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,10 @@ import sys import setuptools import os -__version__ = '0.0.2' +from cppjieba_py import __version__ + class get_pybind_include(object): """Helper class to determine the pybind11 include path The purpose of this class is to postpone importing pybind11 @@ -94,9 +95,8 @@ def build_extensions(self): setup( name='cppjieba_py', version=__version__, - author='yeping zheng', - author_email='fantasy614@gmail.com', - url='https://github.com/fantasy/cppjieba-py', + author='bung87,yeping zheng', + url='https://github.com/bung87/cppjieba-py/', description='A python extension for cppjieba', long_description='', packages=['cppjieba_py'], From 92bde245eedc93f5ed8de3123cc3e976725d4df8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 18:19:46 +0800 Subject: [PATCH 09/63] update readme --- README.md | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 19ad71d..b33d73c 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,24 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 -## 性能 +## 安装 -测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 +* pypi -| 方案 | 速度 | -| ------------- |:-------------:| -| cppjieba-py | 6.218346 | -| jieba | 1:24.703040 | + ```pip install cppjieba-py``` + +* 从发行包安装 + see [releases](https://github.com/bung87/cppjieba-py/releases) + + ```pip install https://github.com/bung87/cppjieba-py/files//cppjieba_py-.tar.gz``` + +* 从源代码安装 + + ``` + $ git clone --recursive https://github.com/bung87/cppjieba-py + $ python setup.py build + $ python setup.py install + ``` ## 使用 @@ -34,17 +44,11 @@ print(", ".join(seg_list)) ``` -## 安装 - -* 从发行包安装 - see [releases](https://github.com/bung87/cppjieba-py/releases) - - ```pip install https://github.com/bung87/cppjieba-py/files//cppjieba_py-.tar.gz``` +## 性能 -* 从源代码安装 +测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 - ``` - $ git clone --recursive https://github.com/fantasy/cppjieba-py - $ python setup.py build - $ python setup.py install - ``` \ No newline at end of file +| 方案 | 速度 | +| ------------- |:-------------:| +| cppjieba-py | 6.218346 | +| jieba | 1:24.703040 | \ No newline at end of file From 2f5ee88eba18d967d0547794d749a66c57c5eea8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 18:41:33 +0800 Subject: [PATCH 10/63] compare to jieba_fast --- README.md | 1 + performace_test/cppjieba.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index b33d73c..35373d9 100644 --- a/README.md +++ b/README.md @@ -51,4 +51,5 @@ print(", ".join(seg_list)) | 方案 | 速度 | | ------------- |:-------------:| | cppjieba-py | 6.218346 | +| jieba_fast | 31.315147 | | jieba | 1:24.703040 | \ No newline at end of file diff --git a/performace_test/cppjieba.py b/performace_test/cppjieba.py index 83f67c9..7477ca8 100644 --- a/performace_test/cppjieba.py +++ b/performace_test/cppjieba.py @@ -11,6 +11,8 @@ import cppjieba_py as jieba elif sys.argv[1] == "jieba": # 0:01:24.703040 import jieba + elif sys.argv[1] == "jieba_fast": + import jieba_fast as jieba lines = [] weicheng = os.path.join(os.path.dirname(__file__),"weicheng.utf8") for line in open(weicheng): From ec5eb8d0dc09d8c340bf3f7ac334404426dd1381 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 19:00:45 +0800 Subject: [PATCH 11/63] posseg --- cppjieba_py/posseg.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cppjieba_py/posseg.py b/cppjieba_py/posseg.py index 8cc4657..e4f3078 100644 --- a/cppjieba_py/posseg.py +++ b/cppjieba_py/posseg.py @@ -2,8 +2,7 @@ def cut(sentence): it = tag_internal(sentence) - for word,tag in it: - yield (word,tag) + return iter(it) def lcut(sentence): - return list(tag_internal(sentence)) \ No newline at end of file + return tag_internal(sentence) \ No newline at end of file From 205a13755ade50004c7b4ec447dbd9bccf38191b Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 22:43:53 +0800 Subject: [PATCH 12/63] analyse.extract_tags --- cppjieba_py/__init__.py | 31 +++++++--- example.py | 13 +++-- setup.py | 5 +- src/main.cpp | 124 ++++++++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 53 deletions(-) diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index 9eb2787..f672d9d 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -1,24 +1,39 @@ -from libcppjieba import cut_for_search_internal,tag_internal,cut_internal +from libcppjieba import cut_for_search_internal as _cut_for_search_internal,\ + tag_internal as _tag_internal,\ + cut_internal as _cut_internal from libcppjieba import Tokenizer from libcppjieba import lcut,lcut_for_search,initialize +def _iter_wraps_doc(origin): + return origin.__doc__.replace(origin.__name__,"Iterator wraps %s" % origin.__name__,1) + def cut(*args,**kvargs): - it = cut_internal(*args,**kvargs) + it = _cut_internal(*args,**kvargs) return iter(it) +cut.__doc__ = _iter_wraps_doc(_cut_internal) + def cut_for_search(*args,**kvargs): - it = cut_for_search_internal(*args,**kvargs) + it = _cut_for_search_internal(*args,**kvargs) return iter(it) + +cut_for_search.__doc__ = _iter_wraps_doc(_cut_for_search_internal) -def c_cut(ins,*args,**kvargs): +def _c_cut(ins,*args,**kvargs): it = ins.cut_internal(*args,**kvargs) return iter(it) -def c_cut_for_search(ins,*args,**kvargs): +def _c_cut_for_search(ins,*args,**kvargs): it = ins.cut_for_search_internal(*args,**kvargs) return iter(it) -setattr(Tokenizer,"cut",c_cut) -setattr(Tokenizer,"cut_for_search",c_cut_for_search) +_c_cut.__doc__ = _iter_wraps_doc(Tokenizer.cut_internal) + +_c_cut_for_search.__doc__ = _iter_wraps_doc(Tokenizer.cut_for_search_internal) + +setattr(Tokenizer,"cut",_c_cut) +setattr(Tokenizer,"cut_for_search",_c_cut_for_search) + +from libcppjieba import get_default_analyse -__version__ = '0.0.3' +analyse = get_default_analyse() \ No newline at end of file diff --git a/example.py b/example.py index eeb031b..6e22302 100644 --- a/example.py +++ b/example.py @@ -1,16 +1,16 @@ from cppjieba_py import Tokenizer, cut, cut_for_search, lcut, lcut_for_search, initialize import cppjieba_py.posseg as pseg import datetime - +from cppjieba_py import analyse def main(): jieba_instance = Tokenizer() - seg_list = jieba_instance.cut("我来到北京清华大学") + seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True) print(type(seg_list)) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 - seg_list = jieba_instance.lcut("他来到了网易杭研大厦") # 默认是精确模式 + seg_list = jieba_instance.cut("他来到了网易杭研大厦") # 默认是精确模式 print(", ".join(seg_list)) seg_list = jieba_instance.cut_for_search( @@ -24,13 +24,16 @@ def main(): print(lcut("我来到北京清华大学")) print(list(cut("我来到北京清华大学"))) - + print(cut("我来到北京清华大学",cut_all=True)) print(lcut_for_search("我来到北京清华大学")) print(list(cut_for_search("我来到北京清华大学"))) print(pseg.lcut("我来到北京清华大学")) print(list(pseg.cut("我来到北京清华大学"))) - + s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" + r = analyse.extract_tags(s) + print(r) + if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index 31c8c23..9346340 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,7 @@ import setuptools import os - -from cppjieba_py import __version__ +__version__ = '0.0.4' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -99,8 +98,8 @@ def build_extensions(self): url='https://github.com/bung87/cppjieba-py/', description='A python extension for cppjieba', long_description='', - packages=['cppjieba_py'], ext_modules=ext_modules, + packages=['cppjieba_py'], install_requires=['pybind11>=2.2'], cmdclass={'build_ext': BuildExt}, # zip_safe=False, diff --git a/src/main.cpp b/src/main.cpp index c8cd199..4b40f5c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -86,60 +86,106 @@ struct Tokenizer }; }; - - - namespace Jieba +namespace Jieba { - Tokenizer *dt; +struct Analyse +{ + private: + Tokenizer *tokenizer; + cppjieba::KeywordExtractor *keywordExtractor; - void initialize() + public: + Analyse(Tokenizer *t) : tokenizer(t) { - dt = new Tokenizer(); + initKeyowrdExtractor(); }; - void init_check() + vector extract_tags(const string &sentence, size_t topK = 20) { - if (!dt) - { - initialize(); - } + vector keywords; + keywordExtractor->Extract(sentence, keywords, topK); + return keywords; }; - WordsTaged tag_internal(const string &sentence) + void initKeyowrdExtractor(const string &idfPath = IDF_PATH, + const string &stopWordPath = STOP_WORD_PATH) { - init_check(); - return dt->tag_internal(sentence); + keywordExtractor = new cppjieba::KeywordExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), idfPath, stopWordPath); }; +}; - WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) - { - init_check(); - return dt->cut_internal(sentence, cut_all, HMM); - }; +Tokenizer *dt; +Analyse *analyse; - vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) - { - init_check(); - return dt->lcut(sentence, cut_all, HMM); - }; +void initialize() +{ - vector lcut_all(const string &sentence) - { - init_check(); - return dt->lcut_all(sentence); - }; + dt = new Tokenizer(); +}; - WordVector cut_for_search_internal(const string &sentence, bool HMM = true) +void init_check() +{ + if (!dt) { - init_check(); - return dt->cut_for_search_internal(sentence, HMM); - }; + initialize(); + } +}; - vector lcut_for_search(const string &sentence, bool HMM = true) +Tokenizer *get_default_tokenizer() +{ + init_check(); + return dt; +}; + +void init_check_analyse() +{ + if (!analyse) { - init_check(); - return dt->lcut_for_search(sentence, HMM); - }; + analyse = new Analyse(get_default_tokenizer()); + } +}; + +Analyse *get_default_analyse() +{ + init_check_analyse(); + return analyse; +}; + +WordsTaged tag_internal(const string &sentence) +{ + init_check(); + return dt->tag_internal(sentence); +}; + +WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) +{ + init_check(); + return dt->cut_internal(sentence, cut_all, HMM); +}; + +vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) +{ + init_check(); + return dt->lcut(sentence, cut_all, HMM); +}; + +vector lcut_all(const string &sentence) +{ + init_check(); + return dt->lcut_all(sentence); +}; + +WordVector cut_for_search_internal(const string &sentence, bool HMM = true) +{ + init_check(); + return dt->cut_for_search_internal(sentence, HMM); +}; + +vector lcut_for_search(const string &sentence, bool HMM = true) +{ + init_check(); + return dt->lcut_for_search(sentence, HMM); +}; }; // namespace Jieba @@ -154,6 +200,10 @@ PYBIND11_MODULE(libcppjieba, m) m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); m.def("tag_internal", &Jieba::tag_internal, py::arg("sentence")); m.def("initialize", &Jieba::initialize); + m.def("get_default_analyse", &Jieba::get_default_analyse); + py::class_(m, "Analyse") + .def(py::init()) + .def("extract_tags", &Jieba::Analyse::extract_tags, py::arg("sentence"), py::arg("topK") = 20); py::class_(m, "Tokenizer") .def(py::init<>()) From 05c5820bc79df3e243feb9b3e7a9edccec86b8a1 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Jun 2018 23:34:51 +0800 Subject: [PATCH 13/63] export .cut_all --- README.md | 2 +- cppjieba_py/__init__.py | 5 +++++ setup.py | 4 ++-- src/main.cpp | 16 ++++++++++++++++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 35373d9..629a3a1 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 import cppjieba_py as jieba # or use defualt Tokenizer: jieba.cut jieba_instance = Tokenizer("cppjieba/dict/user.dict.utf8") -seg_list = jieba_instance.cut("我来到北京清华大学") +seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index f672d9d..6611da4 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -3,6 +3,7 @@ cut_internal as _cut_internal from libcppjieba import Tokenizer from libcppjieba import lcut,lcut_for_search,initialize +from libcppjieba import cut_all as _cut_all,lcut_all def _iter_wraps_doc(origin): return origin.__doc__.replace(origin.__name__,"Iterator wraps %s" % origin.__name__,1) @@ -11,6 +12,10 @@ def cut(*args,**kvargs): it = _cut_internal(*args,**kvargs) return iter(it) +def cut_all(*args,**kvargs): + it = _cut_all(*args,**kvargs) + return iter(it) + cut.__doc__ = _iter_wraps_doc(_cut_internal) def cut_for_search(*args,**kvargs): diff --git a/setup.py b/setup.py index 9346340..4247b22 100644 --- a/setup.py +++ b/setup.py @@ -97,10 +97,10 @@ def build_extensions(self): author='bung87,yeping zheng', url='https://github.com/bung87/cppjieba-py/', description='A python extension for cppjieba', - long_description='', + long_description= open("README.md").read(), ext_modules=ext_modules, packages=['cppjieba_py'], install_requires=['pybind11>=2.2'], cmdclass={'build_ext': BuildExt}, - # zip_safe=False, + zip_safe=False, ) diff --git a/src/main.cpp b/src/main.cpp index 4b40f5c..554bbd2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -57,6 +57,13 @@ struct Tokenizer return words; }; + vector cut_all(const string &sentence) + { + vector words; + jieba.CutAll(sentence, words); + return words; + }; + vector lcut_all(const string &sentence) { vector words; @@ -181,6 +188,13 @@ WordVector cut_for_search_internal(const string &sentence, bool HMM = true) return dt->cut_for_search_internal(sentence, HMM); }; +vector cut_all(const string &sentence) +{ + init_check(); + return dt->cut_all(sentence); +}; + + vector lcut_for_search(const string &sentence, bool HMM = true) { init_check(); @@ -195,6 +209,7 @@ PYBIND11_MODULE(libcppjieba, m) m.def("cut_internal", &Jieba::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); m.def("lcut", &Jieba::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); + m.def("cut_all", &Jieba::cut_all); m.def("lcut_all", &Jieba::lcut_all); m.def("lcut_for_search", &Jieba::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true); m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); @@ -211,6 +226,7 @@ PYBIND11_MODULE(libcppjieba, m) .def("cut_internal", &Tokenizer::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut_all", &Tokenizer::lcut_all) + .def("cut_all", &Tokenizer::cut_all) .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) .def("tag_internal", &Tokenizer::tag_internal, py::arg("sentence")); From d36a1952e3565d2a414efc5c91ff342dac67adad Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 00:40:15 +0800 Subject: [PATCH 14/63] analyse.textrank --- cppjieba_py/__init__.py | 3 -- cppjieba_py/analyse.py | 14 +++++++ example.py | 5 ++- src/main.cpp | 81 ++++++++++++++++++++++++++++++++++------- 4 files changed, 86 insertions(+), 17 deletions(-) create mode 100644 cppjieba_py/analyse.py diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index 6611da4..47866a2 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -39,6 +39,3 @@ def _c_cut_for_search(ins,*args,**kvargs): setattr(Tokenizer,"cut",_c_cut) setattr(Tokenizer,"cut_for_search",_c_cut_for_search) -from libcppjieba import get_default_analyse - -analyse = get_default_analyse() \ No newline at end of file diff --git a/cppjieba_py/analyse.py b/cppjieba_py/analyse.py new file mode 100644 index 0000000..cc90c37 --- /dev/null +++ b/cppjieba_py/analyse.py @@ -0,0 +1,14 @@ +from libcppjieba import get_default_keywordExtractor as _get_default_keywordExtractor,\ + get_default_textrank_extractor as _get_default_textrank_extractor + +keywordExtractor = _get_default_keywordExtractor() +textrankExtractor = _get_default_textrank_extractor() + +def extract_tags(sentence,topK = 20): + return keywordExtractor.extract_tags(sentence,topK) + +def textrank(sentence, topK=20, withWeight=False): + if not withWeight: + return textrankExtractor.textrank(sentence,topK) + else: + return textrankExtractor.textrank_with_weight(sentence,topK) \ No newline at end of file diff --git a/example.py b/example.py index 6e22302..2721d5f 100644 --- a/example.py +++ b/example.py @@ -34,6 +34,9 @@ def main(): s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" r = analyse.extract_tags(s) print(r) - + + r = analyse.textrank(s, withWeight=True) + print(r) + if __name__ == '__main__': main() diff --git a/src/main.cpp b/src/main.cpp index 554bbd2..697e15b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,6 +2,7 @@ #include #include #include "cppjieba/Jieba.hpp" +#include "cppjieba/TextRankExtractor.hpp" #include namespace py = pybind11; @@ -95,14 +96,14 @@ struct Tokenizer namespace Jieba { -struct Analyse +struct KeyowrdExtractor { private: Tokenizer *tokenizer; cppjieba::KeywordExtractor *keywordExtractor; public: - Analyse(Tokenizer *t) : tokenizer(t) + KeyowrdExtractor(Tokenizer *t) : tokenizer(t) { initKeyowrdExtractor(); }; @@ -121,8 +122,42 @@ struct Analyse }; }; +struct TextRankExtractor +{ + private: + Tokenizer *tokenizer; + cppjieba::TextRankExtractor *textRankExtractor; + + public: + TextRankExtractor(Tokenizer *t) : tokenizer(t) + { + initTextRankExtractor(); + }; + + vector textrank(const string &sentence, size_t topK = 20) + { + vector keywords; + textRankExtractor->Extract(sentence, keywords, topK); + return keywords; + }; + + vector> textrank_with_weight(const string &sentence, size_t topK = 20) + { + vector> keywords; + textRankExtractor->Extract(sentence, keywords, topK); + return keywords; + }; + + void initTextRankExtractor(const string &stopWordPath = STOP_WORD_PATH) + + { + textRankExtractor = new cppjieba::TextRankExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), stopWordPath); + }; +}; + Tokenizer *dt; -Analyse *analyse; +KeyowrdExtractor *keywordExtractor; +TextRankExtractor *textRankExtractor; void initialize() { @@ -144,18 +179,32 @@ Tokenizer *get_default_tokenizer() return dt; }; -void init_check_analyse() +void init_check_textrank_extractor() { - if (!analyse) + if (!textRankExtractor) { - analyse = new Analyse(get_default_tokenizer()); + textRankExtractor = new TextRankExtractor(get_default_tokenizer()); } }; -Analyse *get_default_analyse() +TextRankExtractor *get_default_textrank_extractor() { - init_check_analyse(); - return analyse; + init_check_textrank_extractor(); + return textRankExtractor; +}; + +void init_check_keywordExtractor() +{ + if (!keywordExtractor) + { + keywordExtractor = new KeyowrdExtractor(get_default_tokenizer()); + } +}; + +KeyowrdExtractor *get_default_keywordExtractor() +{ + init_check_keywordExtractor(); + return keywordExtractor; }; WordsTaged tag_internal(const string &sentence) @@ -194,7 +243,6 @@ vector cut_all(const string &sentence) return dt->cut_all(sentence); }; - vector lcut_for_search(const string &sentence, bool HMM = true) { init_check(); @@ -215,10 +263,17 @@ PYBIND11_MODULE(libcppjieba, m) m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); m.def("tag_internal", &Jieba::tag_internal, py::arg("sentence")); m.def("initialize", &Jieba::initialize); - m.def("get_default_analyse", &Jieba::get_default_analyse); - py::class_(m, "Analyse") + m.def("get_default_keywordExtractor", &Jieba::get_default_keywordExtractor); + m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); + + py::class_(m, "KeyowrdExtractor") + .def(py::init()) + .def("extract_tags", &Jieba::KeyowrdExtractor::extract_tags, py::arg("sentence"), py::arg("topK") = 20); + + py::class_(m, "TextRankExtractor") .def(py::init()) - .def("extract_tags", &Jieba::Analyse::extract_tags, py::arg("sentence"), py::arg("topK") = 20); + .def("textrank_with_weight", &Jieba::TextRankExtractor::textrank_with_weight, py::arg("sentence"), py::arg("topK") = 20) + .def("textrank", &Jieba::TextRankExtractor::textrank, py::arg("sentence"), py::arg("topK") = 20); py::class_(m, "Tokenizer") .def(py::init<>()) From 74854b3fd52c1a7290d4e7efe0dc10d1a58a226e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 00:57:30 +0800 Subject: [PATCH 15/63] typo --- src/main.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 697e15b..a5d6baa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -96,14 +96,14 @@ struct Tokenizer namespace Jieba { -struct KeyowrdExtractor +struct KeywordExtractor { private: Tokenizer *tokenizer; cppjieba::KeywordExtractor *keywordExtractor; public: - KeyowrdExtractor(Tokenizer *t) : tokenizer(t) + KeywordExtractor(Tokenizer *t) : tokenizer(t) { initKeyowrdExtractor(); }; @@ -156,7 +156,7 @@ struct TextRankExtractor }; Tokenizer *dt; -KeyowrdExtractor *keywordExtractor; +KeywordExtractor *keywordExtractor; TextRankExtractor *textRankExtractor; void initialize() @@ -197,11 +197,11 @@ void init_check_keywordExtractor() { if (!keywordExtractor) { - keywordExtractor = new KeyowrdExtractor(get_default_tokenizer()); + keywordExtractor = new KeywordExtractor(get_default_tokenizer()); } }; -KeyowrdExtractor *get_default_keywordExtractor() +KeywordExtractor *get_default_keywordExtractor() { init_check_keywordExtractor(); return keywordExtractor; @@ -266,9 +266,9 @@ PYBIND11_MODULE(libcppjieba, m) m.def("get_default_keywordExtractor", &Jieba::get_default_keywordExtractor); m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); - py::class_(m, "KeyowrdExtractor") + py::class_(m, "KeywordExtractor") .def(py::init()) - .def("extract_tags", &Jieba::KeyowrdExtractor::extract_tags, py::arg("sentence"), py::arg("topK") = 20); + .def("extract_tags", &Jieba::KeywordExtractor::extract_tags, py::arg("sentence"), py::arg("topK") = 20); py::class_(m, "TextRankExtractor") .def(py::init()) From 449d47b6a7c557a4d2ed87c5fbcdbd9333a1cc49 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 01:16:05 +0800 Subject: [PATCH 16/63] v0.0.5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4247b22..d14511f 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import setuptools import os -__version__ = '0.0.4' +__version__ = '0.0.5' class get_pybind_include(object): """Helper class to determine the pybind11 include path From cb2d7e614eea549dcb22857289b3deefa99a8497 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 03:43:43 +0800 Subject: [PATCH 17/63] o --- cppjieba_py/analyse.py | 22 ++++++++++++++------- example.py | 8 ++++++++ src/main.cpp | 43 +++++++++++++++++++++++++++--------------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/cppjieba_py/analyse.py b/cppjieba_py/analyse.py index cc90c37..2e4a193 100644 --- a/cppjieba_py/analyse.py +++ b/cppjieba_py/analyse.py @@ -1,14 +1,22 @@ from libcppjieba import get_default_keywordExtractor as _get_default_keywordExtractor,\ get_default_textrank_extractor as _get_default_textrank_extractor -keywordExtractor = _get_default_keywordExtractor() -textrankExtractor = _get_default_textrank_extractor() +from libcppjieba import KeywordExtractor ,\ + TextRankExtractor as TextRank -def extract_tags(sentence,topK = 20): - return keywordExtractor.extract_tags(sentence,topK) +TFIDF = KeywordExtractor -def textrank(sentence, topK=20, withWeight=False): +def _textrank(self,sentence, topK=20, withWeight=False): if not withWeight: - return textrankExtractor.textrank(sentence,topK) + return self.textrank_no_weight(sentence,topK) else: - return textrankExtractor.textrank_with_weight(sentence,topK) \ No newline at end of file + return self.textrank_with_weight(sentence,topK) + +setattr(TextRank,"textrank",_textrank) + +keywordExtractor = _get_default_keywordExtractor() +textrankExtractor = _get_default_textrank_extractor() + +extract_tags = keywordExtractor.extract_tags +textrank = textrankExtractor.textrank + diff --git a/example.py b/example.py index 2721d5f..51d424f 100644 --- a/example.py +++ b/example.py @@ -2,6 +2,7 @@ import cppjieba_py.posseg as pseg import datetime from cppjieba_py import analyse +from cppjieba_py.analyse import TextRank,TFIDF def main(): @@ -38,5 +39,12 @@ def main(): r = analyse.textrank(s, withWeight=True) print(r) + tr = TextRank(jieba_instance) + print(tr.textrank(s,topK=2,withWeight=True)) + + tf = TFIDF(jieba_instance) + print(tf.extract_tags(s,topK=10)) + + if __name__ == '__main__': main() diff --git a/src/main.cpp b/src/main.cpp index a5d6baa..0356eb8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -102,24 +102,30 @@ struct KeywordExtractor Tokenizer *tokenizer; cppjieba::KeywordExtractor *keywordExtractor; + void initKeyowrdExtractor(const string &idfPath = IDF_PATH, + const string &stopWordPath = STOP_WORD_PATH) + { + keywordExtractor = new cppjieba::KeywordExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), idfPath, stopWordPath); + }; + public: KeywordExtractor(Tokenizer *t) : tokenizer(t) { initKeyowrdExtractor(); }; + KeywordExtractor(Tokenizer *t, const string &idfPath, + const string &stopWordPath) : tokenizer(t) + { + initKeyowrdExtractor(idfPath, stopWordPath); + }; + vector extract_tags(const string &sentence, size_t topK = 20) { vector keywords; keywordExtractor->Extract(sentence, keywords, topK); return keywords; }; - - void initKeyowrdExtractor(const string &idfPath = IDF_PATH, - const string &stopWordPath = STOP_WORD_PATH) - { - keywordExtractor = new cppjieba::KeywordExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), idfPath, stopWordPath); - }; }; struct TextRankExtractor @@ -128,13 +134,24 @@ struct TextRankExtractor Tokenizer *tokenizer; cppjieba::TextRankExtractor *textRankExtractor; + void initTextRankExtractor(const string &stopWordPath = STOP_WORD_PATH) + + { + textRankExtractor = new cppjieba::TextRankExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), stopWordPath); + }; + public: TextRankExtractor(Tokenizer *t) : tokenizer(t) { initTextRankExtractor(); }; - vector textrank(const string &sentence, size_t topK = 20) + TextRankExtractor(Tokenizer *t, const string &stopWordPath) : tokenizer(t) + { + initTextRankExtractor(stopWordPath); + }; + + vector textrank_no_weight(const string &sentence, size_t topK = 20) { vector keywords; textRankExtractor->Extract(sentence, keywords, topK); @@ -147,12 +164,6 @@ struct TextRankExtractor textRankExtractor->Extract(sentence, keywords, topK); return keywords; }; - - void initTextRankExtractor(const string &stopWordPath = STOP_WORD_PATH) - - { - textRankExtractor = new cppjieba::TextRankExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), stopWordPath); - }; }; Tokenizer *dt; @@ -268,12 +279,14 @@ PYBIND11_MODULE(libcppjieba, m) py::class_(m, "KeywordExtractor") .def(py::init()) + .def(py::init()) .def("extract_tags", &Jieba::KeywordExtractor::extract_tags, py::arg("sentence"), py::arg("topK") = 20); py::class_(m, "TextRankExtractor") .def(py::init()) - .def("textrank_with_weight", &Jieba::TextRankExtractor::textrank_with_weight, py::arg("sentence"), py::arg("topK") = 20) - .def("textrank", &Jieba::TextRankExtractor::textrank, py::arg("sentence"), py::arg("topK") = 20); + .def(py::init()) + .def("textrank_no_weight", &Jieba::TextRankExtractor::textrank_no_weight, py::arg("sentence"), py::arg("topK") = 20) + .def("textrank_with_weight", &Jieba::TextRankExtractor::textrank_with_weight, py::arg("sentence"), py::arg("topK") = 20); py::class_(m, "Tokenizer") .def(py::init<>()) From ce2984e65ab932ab31a0b8c75b30a2c3147c38a4 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 15:34:10 +0800 Subject: [PATCH 18/63] .add_word ,tokenize ,and clean names --- cppjieba_py/__init__.py | 27 ++++++++------- cppjieba_py/analyse.py | 8 ++--- cppjieba_py/posseg.py | 8 ++--- example.py | 4 +++ src/main.cpp | 73 ++++++++++++++++++++++++++++++++++------- 5 files changed, 87 insertions(+), 33 deletions(-) diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index 47866a2..c2e99a3 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -1,7 +1,6 @@ -from libcppjieba import cut_for_search_internal as _cut_for_search_internal,\ - tag_internal as _tag_internal,\ - cut_internal as _cut_internal -from libcppjieba import Tokenizer + +import libcppjieba +from libcppjieba import Tokenizer,add_word from libcppjieba import lcut,lcut_for_search,initialize from libcppjieba import cut_all as _cut_all,lcut_all @@ -9,33 +8,33 @@ def _iter_wraps_doc(origin): return origin.__doc__.replace(origin.__name__,"Iterator wraps %s" % origin.__name__,1) def cut(*args,**kvargs): - it = _cut_internal(*args,**kvargs) + it = libcppjieba.cut(*args,**kvargs) return iter(it) def cut_all(*args,**kvargs): it = _cut_all(*args,**kvargs) return iter(it) -cut.__doc__ = _iter_wraps_doc(_cut_internal) +cut.__doc__ = _iter_wraps_doc(libcppjieba.cut) def cut_for_search(*args,**kvargs): - it = _cut_for_search_internal(*args,**kvargs) + it = libcppjieba.cut_for_search(*args,**kvargs) return iter(it) -cut_for_search.__doc__ = _iter_wraps_doc(_cut_for_search_internal) +cut_for_search.__doc__ = _iter_wraps_doc(libcppjieba.cut_for_search) -def _c_cut(ins,*args,**kvargs): +def _cut(ins,*args,**kvargs): it = ins.cut_internal(*args,**kvargs) return iter(it) -def _c_cut_for_search(ins,*args,**kvargs): +def _cut_for_search(ins,*args,**kvargs): it = ins.cut_for_search_internal(*args,**kvargs) return iter(it) -_c_cut.__doc__ = _iter_wraps_doc(Tokenizer.cut_internal) +_cut.__doc__ = _iter_wraps_doc(Tokenizer.cut_internal) -_c_cut_for_search.__doc__ = _iter_wraps_doc(Tokenizer.cut_for_search_internal) +_cut_for_search.__doc__ = _iter_wraps_doc(Tokenizer.cut_for_search_internal) -setattr(Tokenizer,"cut",_c_cut) -setattr(Tokenizer,"cut_for_search",_c_cut_for_search) +setattr(Tokenizer,"cut",_cut) +setattr(Tokenizer,"cut_for_search",_cut_for_search) diff --git a/cppjieba_py/analyse.py b/cppjieba_py/analyse.py index 2e4a193..bfe8b9a 100644 --- a/cppjieba_py/analyse.py +++ b/cppjieba_py/analyse.py @@ -1,5 +1,5 @@ -from libcppjieba import get_default_keywordExtractor as _get_default_keywordExtractor,\ - get_default_textrank_extractor as _get_default_textrank_extractor +from libcppjieba import get_default_keyword_extractor ,\ + get_default_textrank_extractor from libcppjieba import KeywordExtractor ,\ TextRankExtractor as TextRank @@ -14,8 +14,8 @@ def _textrank(self,sentence, topK=20, withWeight=False): setattr(TextRank,"textrank",_textrank) -keywordExtractor = _get_default_keywordExtractor() -textrankExtractor = _get_default_textrank_extractor() +keywordExtractor = get_default_keyword_extractor() +textrankExtractor = get_default_textrank_extractor() extract_tags = keywordExtractor.extract_tags textrank = textrankExtractor.textrank diff --git a/cppjieba_py/posseg.py b/cppjieba_py/posseg.py index e4f3078..e8f3dd8 100644 --- a/cppjieba_py/posseg.py +++ b/cppjieba_py/posseg.py @@ -1,8 +1,8 @@ -from libcppjieba import tag_internal + +import libcppjieba def cut(sentence): - it = tag_internal(sentence) + it = libcppjieba.tag(sentence) return iter(it) -def lcut(sentence): - return tag_internal(sentence) \ No newline at end of file +lcut = libcppjieba.tag \ No newline at end of file diff --git a/example.py b/example.py index 51d424f..dcdbbdc 100644 --- a/example.py +++ b/example.py @@ -45,6 +45,10 @@ def main(): tf = TFIDF(jieba_instance) print(tf.extract_tags(s,topK=10)) + result = jieba_instance.tokenize('永和服装饰品有限公司') + for tk in result: + print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + if __name__ == '__main__': main() diff --git a/src/main.cpp b/src/main.cpp index 0356eb8..61d545a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,11 @@ const std::string IDF_PATH = "cppjieba/dict/idf.utf8"; const std::string STOP_WORD_PATH = "cppjieba/dict/stop_words.utf8"; using namespace std; -// PYBIND11_MAKE_OPAQUE(std::vector); +// PYBIND11_MAKE_OPAQUE(std::vector); using Word = cppjieba::Word; -using WordVector = std::vector; +using WordVector = vector; using WordsTaged = vector>; @@ -28,6 +28,28 @@ struct Tokenizer public: Tokenizer(const string &USER_DICT_PATH) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; + vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) + { + vector> result; + vector words; + if (mode.compare("default") == 0) + { + jieba.Cut(sentence, words, HMM); + } + else + { + jieba.CutForSearch(sentence, words, HMM); + } + + vector::const_iterator it; + it = words.begin(); + while (it != words.end()) + { + result.push_back(make_tuple(it->word, it->unicode_offset, it->unicode_offset + it->unicode_length)); + ++it; + } + return result; + }; WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) { @@ -92,6 +114,11 @@ struct Tokenizer jieba.Tag(sentence, words); return words; }; + + bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) + { + return jieba.InsertUserWord(word, tag); + }; }; namespace Jieba @@ -212,19 +239,19 @@ void init_check_keywordExtractor() } }; -KeywordExtractor *get_default_keywordExtractor() +KeywordExtractor *get_default_keyword_extractor() { init_check_keywordExtractor(); return keywordExtractor; }; -WordsTaged tag_internal(const string &sentence) +WordsTaged tag(const string &sentence) { init_check(); return dt->tag_internal(sentence); }; -WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) +WordVector cut(const string &sentence, bool cut_all = false, bool HMM = true) { init_check(); return dt->cut_internal(sentence, cut_all, HMM); @@ -242,7 +269,7 @@ vector lcut_all(const string &sentence) return dt->lcut_all(sentence); }; -WordVector cut_for_search_internal(const string &sentence, bool HMM = true) +WordVector cut_for_search(const string &sentence, bool HMM = true) { init_check(); return dt->cut_for_search_internal(sentence, HMM); @@ -260,22 +287,35 @@ vector lcut_for_search(const string &sentence, bool HMM = true) return dt->lcut_for_search(sentence, HMM); }; +bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) +{ + init_check(); + return dt->add_word(word, tag); +}; + +vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true){ + init_check(); + return dt->tokenize(sentence,mode, HMM); +}; + }; // namespace Jieba PYBIND11_MODULE(libcppjieba, m) { m.doc() = "python extension for cppjieba"; // optional module docstring - m.def("cut_internal", &Jieba::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); + m.def("cut", &Jieba::cut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); m.def("lcut", &Jieba::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); m.def("cut_all", &Jieba::cut_all); m.def("lcut_all", &Jieba::lcut_all); m.def("lcut_for_search", &Jieba::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true); - m.def("cut_for_search_internal", &Jieba::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true); - m.def("tag_internal", &Jieba::tag_internal, py::arg("sentence")); + m.def("cut_for_search", &Jieba::cut_for_search, py::arg("sentence"), py::arg("HMM") = true); + m.def("tag", &Jieba::tag, py::arg("sentence")); m.def("initialize", &Jieba::initialize); - m.def("get_default_keywordExtractor", &Jieba::get_default_keywordExtractor); + m.def("get_default_keyword_extractor", &Jieba::get_default_keyword_extractor); m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); + m.def("add_word", &Jieba::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG); + m.def("tokenize", &Jieba::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); py::class_(m, "KeywordExtractor") .def(py::init()) @@ -297,5 +337,16 @@ PYBIND11_MODULE(libcppjieba, m) .def("cut_all", &Tokenizer::cut_all) .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) - .def("tag_internal", &Tokenizer::tag_internal, py::arg("sentence")); + .def("tag_internal", &Tokenizer::tag_internal, py::arg("sentence")) + .def("add_word", &Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) + .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); + + // py::class_(m, "Word") + // .def_readonly("word", &Word::word) + // .def("__str__", [](const Word &v) { + // return v.word; + // }) + // .def("__repr__", [](const Word &v) { + // return v.word; + // }); } \ No newline at end of file From 11122950f7e4cce579f7bc7ebe216b7b002c3f8e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 16:22:12 +0800 Subject: [PATCH 19/63] update version,doc --- README.md | 16 +++++++++++++++- cppjieba_py/__init__.py | 2 +- example.py | 4 +++- setup.py | 4 ++-- src/main.cpp | 6 +++--- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 629a3a1..cc1aec5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,20 @@ # cppjieba-py -cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 +cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 + +由于只是对cppjieba的接口进行的封装,所以执行效率上非常接近于原cppjieba。 + +项目主要分为两个部分**libcppjieba** 为 cppjieba 的 python extension, +**cppjieba_py** 为使开发者平滑过渡到使用cppjieba-py而作的 python package。 +具体见[example.py](example.py)。 + +### 区别 + +* 原jieba的`.cut`族接口基本都由python的`iter()`函数包裹list结果来实现。 +* 原jieba的`.set_*`方法基本都由class传入初始化的参数来完成。 +* `.add_word` 由于cppjieba没提供**freq**参数,实现上不一致。 +* `.del_word` 和`.suggest_freq` cppjieba没提供。 +* `POSTokenizer.lcut` 在`Tokenizer.tag` 下, 唯一一个只提供了list返回类型的接口。 ## 安装 diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index c2e99a3..92cc5ef 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -1,6 +1,6 @@ import libcppjieba -from libcppjieba import Tokenizer,add_word +from libcppjieba import Tokenizer,add_word,tokenize from libcppjieba import lcut,lcut_for_search,initialize from libcppjieba import cut_all as _cut_all,lcut_all diff --git a/example.py b/example.py index dcdbbdc..7e779cc 100644 --- a/example.py +++ b/example.py @@ -1,4 +1,4 @@ -from cppjieba_py import Tokenizer, cut, cut_for_search, lcut, lcut_for_search, initialize +from cppjieba_py import Tokenizer, cut, tokenize, cut_for_search, lcut, lcut_for_search, initialize import cppjieba_py.posseg as pseg import datetime from cppjieba_py import analyse @@ -49,6 +49,8 @@ def main(): for tk in result: print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + print(tokenize('永和服装饰品有限公司',mode="search")) + if __name__ == '__main__': main() diff --git a/setup.py b/setup.py index d14511f..5828c0b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import setuptools import os -__version__ = '0.0.5' +__version__ = '0.0.6' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -96,7 +96,7 @@ def build_extensions(self): version=__version__, author='bung87,yeping zheng', url='https://github.com/bung87/cppjieba-py/', - description='A python extension for cppjieba', + description='python bindings of cppjieba', long_description= open("README.md").read(), ext_modules=ext_modules, packages=['cppjieba_py'], diff --git a/src/main.cpp b/src/main.cpp index 61d545a..4fe4fc3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -108,7 +108,7 @@ struct Tokenizer return words; }; - WordsTaged tag_internal(const string &sentence) + WordsTaged tag(const string &sentence) { WordsTaged words; jieba.Tag(sentence, words); @@ -248,7 +248,7 @@ KeywordExtractor *get_default_keyword_extractor() WordsTaged tag(const string &sentence) { init_check(); - return dt->tag_internal(sentence); + return dt->tag(sentence); }; WordVector cut(const string &sentence, bool cut_all = false, bool HMM = true) @@ -337,7 +337,7 @@ PYBIND11_MODULE(libcppjieba, m) .def("cut_all", &Tokenizer::cut_all) .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) - .def("tag_internal", &Tokenizer::tag_internal, py::arg("sentence")) + .def("tag", &Tokenizer::tag, py::arg("sentence")) .def("add_word", &Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); From 1bc799aa13e83d87b3db02a91bd6fdebaa08ce25 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 7 Jun 2018 16:29:35 +0800 Subject: [PATCH 20/63] update readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cc1aec5..c41c772 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,10 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 * pypi ```pip install cppjieba-py``` + + 或者你设置的安装源并未收录本项目 + + ```pip install -i https://pypi.org/simple/ cppjieba-py``` * 从发行包安装 see [releases](https://github.com/bung87/cppjieba-py/releases) @@ -31,7 +35,6 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 ``` $ git clone --recursive https://github.com/bung87/cppjieba-py - $ python setup.py build $ python setup.py install ``` From 83bd0552c32deb5f687349f4fa1827d25aec876d Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 18:25:38 +0800 Subject: [PATCH 21/63] change cppjieba to mine fork version in case needs modify the cpp files --- .gitmodules | 2 +- cppjieba | 2 +- pybind11 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index cce53b4..eff9268 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,4 +3,4 @@ url = https://github.com/pybind/pybind11.git [submodule "cppjieba"] path = cppjieba - url = https://github.com/yanyiwu/cppjieba.git + url = https://github.com/bung87/cppjieba.git diff --git a/cppjieba b/cppjieba index 6aff1f6..1e1e585 160000 --- a/cppjieba +++ b/cppjieba @@ -1 +1 @@ -Subproject commit 6aff1f637c784c27af6bb0868a94ba22617e65b0 +Subproject commit 1e1e585194d0816b369464890db7779436d5da21 diff --git a/pybind11 b/pybind11 index a303c6f..55dc131 160000 --- a/pybind11 +++ b/pybind11 @@ -1 +1 @@ -Subproject commit a303c6fc479662fd53eaa8990dbc65b7de9b7deb +Subproject commit 55dc131944c764ba7e30085b971a9d70531114b3 From c169c543801e56ee4f103c24f5641e46b20ed795 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 18:25:59 +0800 Subject: [PATCH 22/63] add .load_userdict --- example.py | 5 ++++- src/main.cpp | 42 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/example.py b/example.py index 7e779cc..c11836c 100644 --- a/example.py +++ b/example.py @@ -1,4 +1,4 @@ -from cppjieba_py import Tokenizer, cut, tokenize, cut_for_search, lcut, lcut_for_search, initialize +from cppjieba_py import Tokenizer, cut, tokenize, cut_for_search, lcut, lcut_for_search, initialize, load_userdict import cppjieba_py.posseg as pseg import datetime from cppjieba_py import analyse @@ -51,6 +51,9 @@ def main(): print(tokenize('永和服装饰品有限公司',mode="search")) + jieba_instance.load_userdict(["卧槽"]) + + load_userdict(set(["卧槽"])) if __name__ == '__main__': main() diff --git a/src/main.cpp b/src/main.cpp index 4fe4fc3..9813eef 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,8 +26,9 @@ struct Tokenizer cppjieba::Jieba jieba; public: - Tokenizer(const string &USER_DICT_PATH) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; + Tokenizer(const string& main_dict) : jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; + Tokenizer(const string& main_dict,const string& user_dict) : jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH){}; vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) { vector> result; @@ -51,6 +52,18 @@ struct Tokenizer return result; }; + void load_userdict(const vector& buf){ + jieba.LoadUserDict(buf); + }; + + void load_userdict(const set& buf){ + jieba.LoadUserDict(buf); + }; + + void load_userdict(const string& path){ + jieba.LoadUserDict(path); + }; + WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) { WordVector words; @@ -298,6 +311,21 @@ vector> tokenize(const string &sentence, const return dt->tokenize(sentence,mode, HMM); }; +void load_userdict2(const vector& buf){ + init_check(); + dt->load_userdict(buf); +}; + +void load_userdict3(const set& buf){ + init_check(); + dt->load_userdict(buf); +}; + +void load_userdict(const string& path){ + init_check(); + dt->load_userdict(path); +}; + }; // namespace Jieba PYBIND11_MODULE(libcppjieba, m) @@ -316,6 +344,9 @@ PYBIND11_MODULE(libcppjieba, m) m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); m.def("add_word", &Jieba::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG); m.def("tokenize", &Jieba::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); + m.def("load_userdict",(void (*)(const vector&) ) &Jieba::load_userdict2); + m.def("load_userdict",(void (*)(const set&) ) &Jieba::load_userdict3); + m.def("load_userdict",(void (*)(const string&) ) &Jieba::load_userdict); py::class_(m, "KeywordExtractor") .def(py::init()) @@ -330,7 +361,8 @@ PYBIND11_MODULE(libcppjieba, m) py::class_(m, "Tokenizer") .def(py::init<>()) - .def(py::init()) + .def(py::init()) + .def(py::init()) .def("cut_internal", &Tokenizer::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut_all", &Tokenizer::lcut_all) @@ -339,8 +371,10 @@ PYBIND11_MODULE(libcppjieba, m) .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) .def("tag", &Tokenizer::tag, py::arg("sentence")) .def("add_word", &Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) - .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); - + .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true) + .def("load_userdict",(void (Tokenizer::*)(const vector&) ) &Tokenizer::load_userdict) + .def("load_userdict",(void (Tokenizer::*)(const string&) ) &Tokenizer::load_userdict) + .def("load_userdict",(void (Tokenizer::*)(const set&) ) &Tokenizer::load_userdict); // py::class_(m, "Word") // .def_readonly("word", &Word::word) // .def("__str__", [](const Word &v) { From f30e32f9c139046b378fa3dff6e6dee0ace34e49 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 19:52:37 +0800 Subject: [PATCH 23/63] fix while install locally --- setup.py | 6 +++++- src/main.cpp | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 5828c0b..d97a0be 100644 --- a/setup.py +++ b/setup.py @@ -99,7 +99,11 @@ def build_extensions(self): description='python bindings of cppjieba', long_description= open("README.md").read(), ext_modules=ext_modules, - packages=['cppjieba_py'], + packages=['cppjieba_py','cppjieba.dict'], + package_data = { + 'cppjieba.dict': ['*.utf8'] + }, + include_package_data=True, install_requires=['pybind11>=2.2'], cmdclass={'build_ext': BuildExt}, zip_safe=False, diff --git a/src/main.cpp b/src/main.cpp index 9813eef..42fae3f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,10 +7,10 @@ namespace py = pybind11; -const std::string DICT_PATH = "cppjieba/dict/jieba.dict.utf8"; -const std::string HMM_PATH = "cppjieba/dict/hmm_model.utf8"; -const std::string IDF_PATH = "cppjieba/dict/idf.utf8"; -const std::string STOP_WORD_PATH = "cppjieba/dict/stop_words.utf8"; +const std::string DICT_PATH = "../cppjieba/dict/jieba.dict.utf8"; +const std::string HMM_PATH = "../cppjieba/dict/hmm_model.utf8"; +const std::string IDF_PATH = "../cppjieba/dict/idf.utf8"; +const std::string STOP_WORD_PATH = "../cppjieba/dict/stop_words.utf8"; using namespace std; // PYBIND11_MAKE_OPAQUE(std::vector); From 811113c36a45e487f000e14fbe910c6ffe9d937f Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 20:42:18 +0800 Subject: [PATCH 24/63] o --- README.md | 2 +- cppjieba_py/__init__.py | 2 +- src/main.cpp | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c41c772..e893374 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 # -*- coding: utf-8 -*- import cppjieba_py as jieba # or use defualt Tokenizer: jieba.cut -jieba_instance = Tokenizer("cppjieba/dict/user.dict.utf8") +jieba_instance = Tokenizer("cppjieba/dict/jieba.dict.utf8") seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index 92cc5ef..013cc23 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -1,6 +1,6 @@ import libcppjieba -from libcppjieba import Tokenizer,add_word,tokenize +from libcppjieba import Tokenizer,add_word,tokenize,load_userdict from libcppjieba import lcut,lcut_for_search,initialize from libcppjieba import cut_all as _cut_all,lcut_all diff --git a/src/main.cpp b/src/main.cpp index 42fae3f..c2094b9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -29,6 +29,7 @@ struct Tokenizer Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; Tokenizer(const string& main_dict) : jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; Tokenizer(const string& main_dict,const string& user_dict) : jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH){}; + Tokenizer(const string& main_dict,const string& user_dict,const string& stop_word_path) : jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, stop_word_path){}; vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) { vector> result; @@ -363,6 +364,7 @@ PYBIND11_MODULE(libcppjieba, m) .def(py::init<>()) .def(py::init()) .def(py::init()) + .def(py::init()) .def("cut_internal", &Tokenizer::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut_all", &Tokenizer::lcut_all) From 74dbccf2c33e515753490f1384482344b9d1f1c7 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Fri, 8 Jun 2018 23:50:51 +0800 Subject: [PATCH 25/63] fix dict runtime relative path --- setup.py | 7 ++++ src/main.cpp | 113 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/setup.py b/setup.py index d97a0be..4836922 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,9 @@ import setuptools import os +from distutils.sysconfig import get_python_lib +site_package_dir = get_python_lib() + os.path.sep + __version__ = '0.0.6' class get_pybind_include(object): @@ -80,12 +83,16 @@ def build_extensions(self): if ct == 'unix': opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) + opts.append('-DSITE_PACKAGE_PATH="%s"' % + site_package_dir) opts.append(cpp_flag(self.compiler)) if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') elif ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) + opts.append('/DSITE_PACKAGE_PATH=\\"%s\\"' % + site_package_dir) for ext in self.extensions: ext.extra_compile_args = opts build_ext.build_extensions(self) diff --git a/src/main.cpp b/src/main.cpp index c2094b9..f6a1fb3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,3 +1,6 @@ +#ifndef SITE_PACKAGE_PATH +#define SITE_PACKAGE_PATH STR_VALUE(SITE_PACKAGE_PATH) +#endif #include #include #include @@ -5,15 +8,13 @@ #include "cppjieba/TextRankExtractor.hpp" #include -namespace py = pybind11; - -const std::string DICT_PATH = "../cppjieba/dict/jieba.dict.utf8"; -const std::string HMM_PATH = "../cppjieba/dict/hmm_model.utf8"; -const std::string IDF_PATH = "../cppjieba/dict/idf.utf8"; -const std::string STOP_WORD_PATH = "../cppjieba/dict/stop_words.utf8"; using namespace std; +namespace py = pybind11; -// PYBIND11_MAKE_OPAQUE(std::vector); +const string DICT_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/jieba.dict.utf8"); +const string HMM_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/hmm_model.utf8"); +const string IDF_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/idf.utf8"); +const string STOP_WORD_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/stop_words.utf8"); using Word = cppjieba::Word; @@ -23,24 +24,43 @@ using WordsTaged = vector>; struct Tokenizer { - cppjieba::Jieba jieba; + cppjieba::Jieba *jieba; public: - Tokenizer() : jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; - Tokenizer(const string& main_dict) : jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH){}; - Tokenizer(const string& main_dict,const string& user_dict) : jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH){}; - Tokenizer(const string& main_dict,const string& user_dict,const string& stop_word_path) : jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, stop_word_path){}; + Tokenizer() + { + + jieba = new cppjieba::Jieba( DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); + }; + + Tokenizer(const string &main_dict) + { + + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); + }; + + Tokenizer(const string &main_dict, const string &user_dict) + { + + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH); + }; + + Tokenizer(const string &main_dict, const string &user_dict, const string &stop_word_path) + { + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, stop_word_path); + }; + vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) { vector> result; vector words; if (mode.compare("default") == 0) { - jieba.Cut(sentence, words, HMM); + jieba->Cut(sentence, words, HMM); } else { - jieba.CutForSearch(sentence, words, HMM); + jieba->CutForSearch(sentence, words, HMM); } vector::const_iterator it; @@ -53,16 +73,19 @@ struct Tokenizer return result; }; - void load_userdict(const vector& buf){ - jieba.LoadUserDict(buf); + void load_userdict(const vector &buf) + { + jieba->LoadUserDict(buf); }; - void load_userdict(const set& buf){ - jieba.LoadUserDict(buf); + void load_userdict(const set &buf) + { + jieba->LoadUserDict(buf); }; - void load_userdict(const string& path){ - jieba.LoadUserDict(path); + void load_userdict(const string &path) + { + jieba->LoadUserDict(path); }; WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) @@ -70,11 +93,11 @@ struct Tokenizer WordVector words; if (cut_all) { - jieba.Cut(sentence, words, HMM); + jieba->Cut(sentence, words, HMM); } else { - jieba.CutAll(sentence, words); + jieba->CutAll(sentence, words); } return words; }; @@ -84,11 +107,11 @@ struct Tokenizer vector words; if (cut_all) { - jieba.Cut(sentence, words, HMM); + jieba->Cut(sentence, words, HMM); } else { - jieba.CutAll(sentence, words); + jieba->CutAll(sentence, words); } return words; @@ -97,41 +120,41 @@ struct Tokenizer vector cut_all(const string &sentence) { vector words; - jieba.CutAll(sentence, words); + jieba->CutAll(sentence, words); return words; }; vector lcut_all(const string &sentence) { vector words; - jieba.CutAll(sentence, words); + jieba->CutAll(sentence, words); return words; }; WordVector cut_for_search_internal(const string &sentence, bool HMM = true) { WordVector words; - jieba.CutForSearch(sentence, words, HMM); + jieba->CutForSearch(sentence, words, HMM); return words; }; vector lcut_for_search(const string &sentence, bool HMM = true) { vector words; - jieba.CutForSearch(sentence, words, HMM); + jieba->CutForSearch(sentence, words, HMM); return words; }; WordsTaged tag(const string &sentence) { WordsTaged words; - jieba.Tag(sentence, words); + jieba->Tag(sentence, words); return words; }; bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) { - return jieba.InsertUserWord(word, tag); + return jieba->InsertUserWord(word, tag); }; }; @@ -146,7 +169,7 @@ struct KeywordExtractor void initKeyowrdExtractor(const string &idfPath = IDF_PATH, const string &stopWordPath = STOP_WORD_PATH) { - keywordExtractor = new cppjieba::KeywordExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), idfPath, stopWordPath); + keywordExtractor = new cppjieba::KeywordExtractor(tokenizer->jieba->GetDictTrie(), tokenizer->jieba->GetHMMModel(), idfPath, stopWordPath); }; public: @@ -178,7 +201,7 @@ struct TextRankExtractor void initTextRankExtractor(const string &stopWordPath = STOP_WORD_PATH) { - textRankExtractor = new cppjieba::TextRankExtractor(tokenizer->jieba.GetDictTrie(), tokenizer->jieba.GetHMMModel(), stopWordPath); + textRankExtractor = new cppjieba::TextRankExtractor(tokenizer->jieba->GetDictTrie(), tokenizer->jieba->GetHMMModel(), stopWordPath); }; public: @@ -307,22 +330,26 @@ bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) return dt->add_word(word, tag); }; -vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true){ +vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) +{ init_check(); - return dt->tokenize(sentence,mode, HMM); + return dt->tokenize(sentence, mode, HMM); }; -void load_userdict2(const vector& buf){ +void load_userdict2(const vector &buf) +{ init_check(); dt->load_userdict(buf); }; -void load_userdict3(const set& buf){ +void load_userdict3(const set &buf) +{ init_check(); dt->load_userdict(buf); }; -void load_userdict(const string& path){ +void load_userdict(const string &path) +{ init_check(); dt->load_userdict(path); }; @@ -345,9 +372,9 @@ PYBIND11_MODULE(libcppjieba, m) m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); m.def("add_word", &Jieba::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG); m.def("tokenize", &Jieba::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); - m.def("load_userdict",(void (*)(const vector&) ) &Jieba::load_userdict2); - m.def("load_userdict",(void (*)(const set&) ) &Jieba::load_userdict3); - m.def("load_userdict",(void (*)(const string&) ) &Jieba::load_userdict); + m.def("load_userdict", (void (*)(const vector &)) & Jieba::load_userdict2); + m.def("load_userdict", (void (*)(const set &)) & Jieba::load_userdict3); + m.def("load_userdict", (void (*)(const string &)) & Jieba::load_userdict); py::class_(m, "KeywordExtractor") .def(py::init()) @@ -374,9 +401,9 @@ PYBIND11_MODULE(libcppjieba, m) .def("tag", &Tokenizer::tag, py::arg("sentence")) .def("add_word", &Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true) - .def("load_userdict",(void (Tokenizer::*)(const vector&) ) &Tokenizer::load_userdict) - .def("load_userdict",(void (Tokenizer::*)(const string&) ) &Tokenizer::load_userdict) - .def("load_userdict",(void (Tokenizer::*)(const set&) ) &Tokenizer::load_userdict); + .def("load_userdict", (void (Tokenizer::*)(const vector &)) & Tokenizer::load_userdict) + .def("load_userdict", (void (Tokenizer::*)(const string &)) & Tokenizer::load_userdict) + .def("load_userdict", (void (Tokenizer::*)(const set &)) & Tokenizer::load_userdict); // py::class_(m, "Word") // .def_readonly("word", &Word::word) // .def("__str__", [](const Word &v) { From 6de04ea8fe1c31e02b4ac44b659798b476c3e64a Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 07:35:07 +0800 Subject: [PATCH 26/63] add test and travis file --- .travis.yml | 18 ++++++++ cppjieba_py/analyse.py | 5 ++- nose.cfg | 4 ++ setup.py | 6 ++- tests/test_jieba.py | 53 ++++++++++++++++++++++++ tests/test_keyword_extractor.py | 40 ++++++++++++++++++ tests/test_textrank_extractor.py | 40 ++++++++++++++++++ tests/test_tokenizer.py | 70 ++++++++++++++++++++++++++++++++ 8 files changed, 233 insertions(+), 3 deletions(-) create mode 100644 .travis.yml create mode 100644 nose.cfg create mode 100644 tests/test_jieba.py create mode 100644 tests/test_keyword_extractor.py create mode 100644 tests/test_textrank_extractor.py create mode 100644 tests/test_tokenizer.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f982770 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,18 @@ +language: python +python: +- '2.7' +- '3.4' +- '3.5' +- '3.6' +sudo: false +before_install: +- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +- sudo apt-get -y update +- sudo apt-get -y install build-essential +- sudo apt-get -y install g++-5 +- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1 +- g++ --version +install: +- pip install -e . +script: +- nosetests -c nose.cfg \ No newline at end of file diff --git a/cppjieba_py/analyse.py b/cppjieba_py/analyse.py index bfe8b9a..bf4e5f5 100644 --- a/cppjieba_py/analyse.py +++ b/cppjieba_py/analyse.py @@ -1,9 +1,12 @@ +# pylint: disable=E0611 from libcppjieba import get_default_keyword_extractor ,\ get_default_textrank_extractor from libcppjieba import KeywordExtractor ,\ - TextRankExtractor as TextRank + TextRankExtractor +# pylint: enable=E0611 +TextRank = TextRankExtractor TFIDF = KeywordExtractor def _textrank(self,sentence, topK=20, withWeight=False): diff --git a/nose.cfg b/nose.cfg new file mode 100644 index 0000000..bc6caf5 --- /dev/null +++ b/nose.cfg @@ -0,0 +1,4 @@ +[nosetests] +where=tests +with-specplugin=1 +with-specselector=1 \ No newline at end of file diff --git a/setup.py b/setup.py index 4836922..3f987e6 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from distutils.sysconfig import get_python_lib site_package_dir = get_python_lib() + os.path.sep -__version__ = '0.0.6' +__version__ = '0.0.7' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -97,7 +97,6 @@ def build_extensions(self): ext.extra_compile_args = opts build_ext.build_extensions(self) - setup( name='cppjieba_py', version=__version__, @@ -112,6 +111,9 @@ def build_extensions(self): }, include_package_data=True, install_requires=['pybind11>=2.2'], + extras_require={ + 'test': ['spec'] + }, cmdclass={'build_ext': BuildExt}, zip_safe=False, ) diff --git a/tests/test_jieba.py b/tests/test_jieba.py new file mode 100644 index 0000000..37f774d --- /dev/null +++ b/tests/test_jieba.py @@ -0,0 +1,53 @@ +from spec import Spec +from pathlib import Path + +DICT_DIR = Path("../cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +USER_DICT = str(DICT_DIR / "user.dict.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +import cppjieba_py as jieba + +class JiebaTest(Spec): + + class cut: + + def takes_arg1_as_sentence(self): + jieba.cut("") + + def takes_arg2_as_cut_all(self): + jieba.cut("", True) + + def takes_arg3_as_HMM(self): + jieba.cut("", True, True) + + def returns_iterator(self): + from collections import Iterable, Sequence + r = jieba.cut("", True, True) + iterable = isinstance(r, Iterable) + sequence = isinstance(r, Sequence) + assert iterable and not sequence + + class lcut: + def takes_arg1_as_sentence(self): + jieba.cut("") + + def takes_arg2_as_cut_all(self): + jieba.cut("", True) + + def takes_arg3_as_HMM(self): + jieba.cut("", True, True) + + def returns_list(self): + r = jieba.lcut("", True, True) + assert isinstance(r, list) + + class load_userdict: + def accept_string_as_arg(self): + jieba.load_userdict("") + + def accept_list_as_arg(self): + jieba.load_userdict([]) + + def accept_set_as_arg(self): + jieba.load_userdict(set([])) diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py new file mode 100644 index 0000000..6b1f632 --- /dev/null +++ b/tests/test_keyword_extractor.py @@ -0,0 +1,40 @@ +# pylint: disable=E1101 +from spec import Spec +from pathlib import Path + +DICT_DIR = Path("../cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +IDF = str(DICT_DIR / "idf.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +from cppjieba_py import Tokenizer +from cppjieba_py.analyse import KeywordExtractor + + +class KeywordExtractorrTest(Spec): + @classmethod + def setUpClass(cls): + cls.dt = Tokenizer(DICT) + cls.sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" + cls.extractor = KeywordExtractor(KeywordExtractorrTest.dt) + + class init: + "__init__" + + def takes_arg1_as_tokenizer(self): + pass + + def takes_arg2_as_IDF_PATH_and_arg3_as_STOP_WORD_PATH(self): + KeywordExtractor(self.dt, IDF, STOP_WORD) + + class extract_tags: + + def takes_arg1_as_sentence(self): + self.extractor.extract_tags(self.sentence) + + def takes_arg2_as_topK(self): + self.extractor.extract_tags(self.sentence, topK=5) + + def returns_list(self): + r = self.extractor.extract_tags(self.sentence, topK=5) + assert isinstance(r, list) diff --git a/tests/test_textrank_extractor.py b/tests/test_textrank_extractor.py new file mode 100644 index 0000000..9c4b5aa --- /dev/null +++ b/tests/test_textrank_extractor.py @@ -0,0 +1,40 @@ +# pylint: disable=E1101 +from spec import Spec +from pathlib import Path + +DICT_DIR = Path("../cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +IDF = str(DICT_DIR / "idf.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +from cppjieba_py import Tokenizer +from cppjieba_py.analyse import TextRankExtractor + + +class TextRankExtractorTest(Spec): + @classmethod + def setUpClass(cls): + cls.dt = Tokenizer(DICT) + cls.sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" + cls.extractor = TextRankExtractor(cls.dt) + + class init: + "__init__" + + def takes_arg1_as_tokenizer(self): + pass + + def takes_arg2_as_STOP_WORD_PATH(self): + TextRankExtractor(self.dt, STOP_WORD) + + class textrank_no_weight: + + def takes_arg1_as_sentence(self): + self.extractor.textrank_no_weight(self.sentence) + + def takes_arg2_as_topK(self): + self.extractor.textrank_no_weight(self.sentence, topK=5) + + def returns_list(self): + r = self.extractor.textrank_no_weight(self.sentence, topK=5) + assert isinstance(r, list) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py new file mode 100644 index 0000000..fc4fe58 --- /dev/null +++ b/tests/test_tokenizer.py @@ -0,0 +1,70 @@ +# pylint: disable=E1101 +from spec import Spec +from pathlib import Path + +DICT_DIR = Path("../cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +USER_DICT = str(DICT_DIR / "user.dict.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +from cppjieba_py import Tokenizer + + +class TokenizerTest(Spec): + @classmethod + def setUpClass(cls): + cls.dt = Tokenizer(DICT) + + class init: + "__init__" + + def takes_arg1_as_main_dict_path(self): + pass + + def takes_arg2_as_user_dict_path(self): + Tokenizer(DICT, USER_DICT) + + def takes_arg3_as_stopword_path(self): + Tokenizer(DICT, USER_DICT, STOP_WORD) + + class cut: + + def takes_arg1_as_sentence(self): + self.dt.cut("") + + def takes_arg2_as_cut_all(self): + self.dt.cut("", True) + + def takes_arg3_as_HMM(self): + self.dt.cut("", True, True) + + def returns_iterator(self): + from collections import Iterable, Sequence + r = self.dt.cut("", True, True) + iterable = isinstance(r, Iterable) + sequence = isinstance(r, Sequence) + assert iterable and not sequence + + class lcut: + def takes_arg1_as_sentence(self): + self.dt.cut("") + + def takes_arg2_as_cut_all(self): + self.dt.cut("", True) + + def takes_arg3_as_HMM(self): + self.dt.cut("", True, True) + + def returns_list(self): + r = self.dt.lcut("", True, True) + assert isinstance(r, list) + + class load_userdict: + def accept_string_as_arg(self): + self.dt.load_userdict("") + + def accept_list_as_arg(self): + self.dt.load_userdict([]) + + def accept_set_as_arg(self): + self.dt.load_userdict(set([])) From 263cb849b99ed905a35545702feee8ef8ec32864 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 07:45:38 +0800 Subject: [PATCH 27/63] o --- README.md | 4 +++- setup.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e893374..fd10043 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# cppjieba-py +# cppjieba-py + +[![Build Status](https://travis-ci.org/bung87/cppjieba-py.svg?branch=master)](https://travis-ci.org/bung87/cppjieba-py) cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 diff --git a/setup.py b/setup.py index 3f987e6..d409ffa 100644 --- a/setup.py +++ b/setup.py @@ -112,7 +112,7 @@ def build_extensions(self): include_package_data=True, install_requires=['pybind11>=2.2'], extras_require={ - 'test': ['spec'] + 'test': ['spec==1.4.1'] }, cmdclass={'build_ext': BuildExt}, zip_safe=False, From cea0079a2f43b83c988105b7b840691efa3e7bfb Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 07:57:32 +0800 Subject: [PATCH 28/63] o --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f982770..f091113 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,6 @@ before_install: - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1 - g++ --version install: -- pip install -e . +- pip install ".[test]" script: - nosetests -c nose.cfg \ No newline at end of file From 981a9f70ac35b14581ede87ec8efb8390e0c9f0e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 08:06:58 +0800 Subject: [PATCH 29/63] o --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d409ffa..b146a00 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,11 @@ def build_extensions(self): ext.extra_compile_args = opts build_ext.build_extensions(self) +install_requires = ['pybind11>=2.2'] + +if sys.version_info[0] <3: + install_requires.append("pathlib2") + setup( name='cppjieba_py', version=__version__, @@ -110,7 +115,7 @@ def build_extensions(self): 'cppjieba.dict': ['*.utf8'] }, include_package_data=True, - install_requires=['pybind11>=2.2'], + install_requires=install_requires, extras_require={ 'test': ['spec==1.4.1'] }, From 8e420ed5240e8b2e1e272c066d4f7386c3844e69 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 08:17:06 +0800 Subject: [PATCH 30/63] o --- setup.py | 10 ++++++---- tests/test_jieba.py | 1 + tests/test_keyword_extractor.py | 1 + tests/test_textrank_extractor.py | 1 + tests/test_tokenizer.py | 1 + 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index b146a00..7c44ad7 100644 --- a/setup.py +++ b/setup.py @@ -99,8 +99,12 @@ def build_extensions(self): install_requires = ['pybind11>=2.2'] +extras_require = { + 'test': ['spec==1.4.1'] + } + if sys.version_info[0] <3: - install_requires.append("pathlib2") + extras_require["test"].append("pathlib") setup( name='cppjieba_py', @@ -116,9 +120,7 @@ def build_extensions(self): }, include_package_data=True, install_requires=install_requires, - extras_require={ - 'test': ['spec==1.4.1'] - }, + extras_require=extras_require, cmdclass={'build_ext': BuildExt}, zip_safe=False, ) diff --git a/tests/test_jieba.py b/tests/test_jieba.py index 37f774d..ba06e61 100644 --- a/tests/test_jieba.py +++ b/tests/test_jieba.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals from spec import Spec from pathlib import Path diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py index 6b1f632..6506178 100644 --- a/tests/test_keyword_extractor.py +++ b/tests/test_keyword_extractor.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals # pylint: disable=E1101 from spec import Spec from pathlib import Path diff --git a/tests/test_textrank_extractor.py b/tests/test_textrank_extractor.py index 9c4b5aa..15e9600 100644 --- a/tests/test_textrank_extractor.py +++ b/tests/test_textrank_extractor.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals # pylint: disable=E1101 from spec import Spec from pathlib import Path diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fc4fe58..eca34fb 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals # pylint: disable=E1101 from spec import Spec from pathlib import Path From 655ff3e3e61b416ddff1ffd6d704d5d5757c4ece Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 08:30:15 +0800 Subject: [PATCH 31/63] o --- tests/test_jieba.py | 7 ++++++- tests/test_keyword_extractor.py | 7 ++++++- tests/test_textrank_extractor.py | 7 ++++++- tests/test_tokenizer.py | 7 ++++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/test_jieba.py b/tests/test_jieba.py index ba06e61..ae84747 100644 --- a/tests/test_jieba.py +++ b/tests/test_jieba.py @@ -1,6 +1,11 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals from spec import Spec -from pathlib import Path +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from pathlib2 import Path DICT_DIR = Path("../cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py index 6506178..3e05037 100644 --- a/tests/test_keyword_extractor.py +++ b/tests/test_keyword_extractor.py @@ -1,7 +1,12 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals # pylint: disable=E1101 from spec import Spec -from pathlib import Path +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from pathlib2 import Path DICT_DIR = Path("../cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") diff --git a/tests/test_textrank_extractor.py b/tests/test_textrank_extractor.py index 15e9600..2949304 100644 --- a/tests/test_textrank_extractor.py +++ b/tests/test_textrank_extractor.py @@ -1,7 +1,12 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals # pylint: disable=E1101 from spec import Spec -from pathlib import Path +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from pathlib2 import Path DICT_DIR = Path("../cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index eca34fb..6c4ba82 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,7 +1,12 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals # pylint: disable=E1101 from spec import Spec -from pathlib import Path +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from pathlib2 import Path DICT_DIR = Path("../cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") From 99e20e6fc64db3ed65a8c17f89662f98490d0ed0 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 08:35:07 +0800 Subject: [PATCH 32/63] o --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7c44ad7..746d873 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ def build_extensions(self): } if sys.version_info[0] <3: - extras_require["test"].append("pathlib") + extras_require["test"].append("pathlib2") setup( name='cppjieba_py', From a2b139cf3337e074c6ec20781616bfa3e4f631b8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 08:48:23 +0800 Subject: [PATCH 33/63] classifiers info [skip ci] --- setup.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/setup.py b/setup.py index 746d873..3945fea 100644 --- a/setup.py +++ b/setup.py @@ -106,6 +106,18 @@ def build_extensions(self): if sys.version_info[0] <3: extras_require["test"].append("pathlib2") +classifiers = [ + 'License :: OSI Approved :: MIT License', + 'Natural Language :: Chinese', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Operating System :: Unix', + 'Topic :: Text Processing :: Linguistic', + 'Topic :: Software Development :: Libraries :: Python Modules' +] + setup( name='cppjieba_py', version=__version__, @@ -113,6 +125,7 @@ def build_extensions(self): url='https://github.com/bung87/cppjieba-py/', description='python bindings of cppjieba', long_description= open("README.md").read(), + classifiers = classifiers, ext_modules=ext_modules, packages=['cppjieba_py','cppjieba.dict'], package_data = { From b735b07979dba6ddc1dc930f76febb3a522d6e0d Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 20:28:59 +0800 Subject: [PATCH 34/63] v0.0.8 --- README.md | 17 ++++++++--- cppjieba | 2 +- cppjieba_py/__init__.py | 2 +- setup.py | 2 +- src/main.cpp | 68 ++++++++++++++++++++++++++++++++--------- tests/test_tokenizer.py | 36 ++++++++++++++++++++-- 6 files changed, 102 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index fd10043..9f94d0a 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,7 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 ### 区别 * 原jieba的`.cut`族接口基本都由python的`iter()`函数包裹list结果来实现。 -* 原jieba的`.set_*`方法基本都由class传入初始化的参数来完成。 -* `.add_word` 由于cppjieba没提供**freq**参数,实现上不一致。 +* 原jieba的`.set_*`方法基本都由class传入初始化的参数来完成。 * `.del_word` 和`.suggest_freq` cppjieba没提供。 * `POSTokenizer.lcut` 在`Tokenizer.tag` 下, 唯一一个只提供了list返回类型的接口。 @@ -49,7 +48,7 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 # -*- coding: utf-8 -*- import cppjieba_py as jieba # or use defualt Tokenizer: jieba.cut -jieba_instance = Tokenizer("cppjieba/dict/jieba.dict.utf8") +jieba_instance = Tokenizer() seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 @@ -63,7 +62,11 @@ print(", ".join(seg_list)) ``` -## 性能 +for more: [example.py](example.py) , [tests](tests) + +## 性能 + +[performace_test/cppjieba.py](performace_test/cppjieba.py) 测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 @@ -71,4 +74,8 @@ print(", ".join(seg_list)) | ------------- |:-------------:| | cppjieba-py | 6.218346 | | jieba_fast | 31.315147 | -| jieba | 1:24.703040 | \ No newline at end of file +| jieba | 1:24.703040 | + +## Test + +`pip install ".[test]"` \ No newline at end of file diff --git a/cppjieba b/cppjieba index 1e1e585..7b2fdc4 160000 --- a/cppjieba +++ b/cppjieba @@ -1 +1 @@ -Subproject commit 1e1e585194d0816b369464890db7779436d5da21 +Subproject commit 7b2fdc41a235f332977ee2ca8c43715e7dc145e0 diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py index 013cc23..7122fc4 100644 --- a/cppjieba_py/__init__.py +++ b/cppjieba_py/__init__.py @@ -1,6 +1,6 @@ import libcppjieba -from libcppjieba import Tokenizer,add_word,tokenize,load_userdict +from libcppjieba import Tokenizer,add_word,tokenize,load_userdict,find,lookup_tag from libcppjieba import lcut,lcut_for_search,initialize from libcppjieba import cut_all as _cut_all,lcut_all diff --git a/setup.py b/setup.py index 3945fea..a3d4d12 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from distutils.sysconfig import get_python_lib site_package_dir = get_python_lib() + os.path.sep -__version__ = '0.0.7' +__version__ = '0.0.8' class get_pybind_include(object): """Helper class to determine the pybind11 include path diff --git a/src/main.cpp b/src/main.cpp index f6a1fb3..8d912df 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,10 +11,10 @@ using namespace std; namespace py = pybind11; -const string DICT_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/jieba.dict.utf8"); -const string HMM_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/hmm_model.utf8"); -const string IDF_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/idf.utf8"); -const string STOP_WORD_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/stop_words.utf8"); +const string DICT_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/jieba.dict.utf8"); +const string HMM_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/hmm_model.utf8"); +const string IDF_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/idf.utf8"); +const string STOP_WORD_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/stop_words.utf8"); using Word = cppjieba::Word; @@ -30,24 +30,24 @@ struct Tokenizer Tokenizer() { - jieba = new cppjieba::Jieba( DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); + jieba = new cppjieba::Jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); }; Tokenizer(const string &main_dict) { - - jieba = new cppjieba::Jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); + + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); }; Tokenizer(const string &main_dict, const string &user_dict) { - jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH); + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH); }; Tokenizer(const string &main_dict, const string &user_dict, const string &stop_word_path) { - jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, stop_word_path); + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, stop_word_path); }; vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) @@ -156,6 +156,21 @@ struct Tokenizer { return jieba->InsertUserWord(word, tag); }; + + bool add_word(const string &word, int freq, const string &tag = cppjieba::UNKNOWN_TAG) + { + return jieba->InsertUserWord(word, freq, tag); + }; + + bool find(const string &word) + { + return jieba->Find(word); + }; + + string lookup_tag(const string &word) const + { + return jieba->LookupTag(word); + }; }; namespace Jieba @@ -330,6 +345,11 @@ bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) return dt->add_word(word, tag); }; +bool add_word(const string &word, int freq, const string &tag = cppjieba::UNKNOWN_TAG) +{ + return dt->add_word(word, freq, tag); +}; + vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) { init_check(); @@ -354,6 +374,18 @@ void load_userdict(const string &path) dt->load_userdict(path); }; +bool find(const string &word) +{ + init_check(); + return dt->find(word); +}; + +const string lookup_tag(const string &word) +{ + init_check(); + return dt->lookup_tag(word); +}; + }; // namespace Jieba PYBIND11_MODULE(libcppjieba, m) @@ -370,11 +402,14 @@ PYBIND11_MODULE(libcppjieba, m) m.def("initialize", &Jieba::initialize); m.def("get_default_keyword_extractor", &Jieba::get_default_keyword_extractor); m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); - m.def("add_word", &Jieba::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG); + m.def("add_word", (bool (*)(const string &, const string &)) & Jieba::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG); + m.def("add_word", (bool (*)(const string &, int freq, const string & )) & Jieba::add_word, py::arg("word"), py::arg("freq"), py::arg("tag") = cppjieba::UNKNOWN_TAG); m.def("tokenize", &Jieba::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); m.def("load_userdict", (void (*)(const vector &)) & Jieba::load_userdict2); m.def("load_userdict", (void (*)(const set &)) & Jieba::load_userdict3); m.def("load_userdict", (void (*)(const string &)) & Jieba::load_userdict); + m.def("find", &Jieba::find); + m.def("lookup_tag", &Jieba::lookup_tag); py::class_(m, "KeywordExtractor") .def(py::init()) @@ -389,9 +424,9 @@ PYBIND11_MODULE(libcppjieba, m) py::class_(m, "Tokenizer") .def(py::init<>()) - .def(py::init()) - .def(py::init()) - .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()) .def("cut_internal", &Tokenizer::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) .def("lcut_all", &Tokenizer::lcut_all) @@ -399,11 +434,14 @@ PYBIND11_MODULE(libcppjieba, m) .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) .def("tag", &Tokenizer::tag, py::arg("sentence")) - .def("add_word", &Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) + .def("add_word", (bool (Tokenizer::*)(const string &, const string & )) & Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) + .def("add_word", (bool (Tokenizer::*)(const string &, int freq, const string & )) & Tokenizer::add_word, py::arg("word"), py::arg("freq"), py::arg("tag") = cppjieba::UNKNOWN_TAG) .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true) .def("load_userdict", (void (Tokenizer::*)(const vector &)) & Tokenizer::load_userdict) .def("load_userdict", (void (Tokenizer::*)(const string &)) & Tokenizer::load_userdict) - .def("load_userdict", (void (Tokenizer::*)(const set &)) & Tokenizer::load_userdict); + .def("load_userdict", (void (Tokenizer::*)(const set &)) & Tokenizer::load_userdict) + .def("find", &Tokenizer::find) + .def("lookup_tag", &Tokenizer::lookup_tag); // py::class_(m, "Word") // .def_readonly("word", &Word::word) // .def("__str__", [](const Word &v) { diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 6c4ba82..f19f187 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -3,7 +3,7 @@ # pylint: disable=E1101 from spec import Spec import sys -if sys.version_info[0] >=3: +if sys.version_info[0] >= 3: from pathlib import Path else: from pathlib2 import Path @@ -20,8 +20,9 @@ class TokenizerTest(Spec): @classmethod def setUpClass(cls): cls.dt = Tokenizer(DICT) + cls.dt.add_word("区块链", 10, "nz") - class init: + class init_0: "__init__" def takes_arg1_as_main_dict_path(self): @@ -74,3 +75,34 @@ def accept_list_as_arg(self): def accept_set_as_arg(self): self.dt.load_userdict(set([])) + + class add_word: + def takes_arg1_as_word(self): + self.dt.add_word("区块链") + + def takes_arg2_as_freq(self): + self.dt.add_word("区块链", 10) + + def takes_arg3_as_tag(self): + pass + + class find: + def takes_arg1_as_word(self): + self.dt.find("区块链") + + def can_find_added_word(self): + r = self.dt.find("区块链") + assert r == True + + class lookup_tag: + def takes_arg1_as_word(self): + self.dt.lookup_tag("区块链") + + def can_find_added_word(self): + self.dt.add_word("区块链", 10, "nz") # because of random test order + # from nose.plugins.skip import Skip + r = self.dt.lookup_tag("区块链") + # try: + assert r == "nz" + # except AssertionError: + # raise Skip() From 1a74c872d7f8d722cf536368b58f4ac0d1c13d2f Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 9 Jun 2018 20:38:07 +0800 Subject: [PATCH 35/63] update classifiers --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a3d4d12..9c70d25 100644 --- a/setup.py +++ b/setup.py @@ -108,11 +108,13 @@ def build_extensions(self): classifiers = [ 'License :: OSI Approved :: MIT License', - 'Natural Language :: Chinese', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: C++', 'Operating System :: Unix', 'Topic :: Text Processing :: Linguistic', 'Topic :: Software Development :: Libraries :: Python Modules' From a78e91dc0cced1fc257ff1779e6e9286f0baa6cb Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 10 Jun 2018 19:11:22 +0800 Subject: [PATCH 36/63] fix cut_all condition --- src/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 8d912df..d26a5e9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -91,7 +91,7 @@ struct Tokenizer WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) { WordVector words; - if (cut_all) + if (!cut_all) { jieba->Cut(sentence, words, HMM); } @@ -105,7 +105,7 @@ struct Tokenizer vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) { vector words; - if (cut_all) + if (!cut_all) { jieba->Cut(sentence, words, HMM); } From ed326b60ff92ea0b2e5ea0c7908c03128e543dc6 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 10 Jun 2018 19:12:04 +0800 Subject: [PATCH 37/63] fix cut_all condition --- performace_test/consistency.py | 123 ++++++++++++++++++++++ performace_test/{cppjieba.py => speed.py} | 0 setup.py | 2 +- 3 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 performace_test/consistency.py rename performace_test/{cppjieba.py => speed.py} (100%) diff --git a/performace_test/consistency.py b/performace_test/consistency.py new file mode 100644 index 0000000..a5b30b6 --- /dev/null +++ b/performace_test/consistency.py @@ -0,0 +1,123 @@ +#encoding=utf-8 +import sys,os +import jieba +import cppjieba_py +# wget https://raw.githubusercontent.com/fxsjy/jieba/master/jieba/dict.txt -O performace_test/dict.txt + +from distutils.sysconfig import get_python_lib +site_package_dir = get_python_lib() +jieba_dict = os.path.join(site_package_dir,"jieba","dict.txt") +tokenizer = cppjieba_py.Tokenizer(jieba_dict) + +def cuttest(test_sent): + result = jieba.lcut(test_sent) + # result2 = cppjieba_py.lcut(test_sent) + result2 = tokenizer.lcut(test_sent) + print(result) + print(result2) + assert result == result2 + + +if __name__ == "__main__": + cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。") + cuttest("我不喜欢日本和服。") + cuttest("雷猴回归人间。") + cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作") + cuttest("我需要廉租房") + cuttest("永和服装饰品有限公司") + cuttest("我爱北京天安门") + cuttest("abc") + cuttest("隐马尔可夫") + cuttest("雷猴是个好网站") + # cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成") + # '一词' / '一', '词' + cuttest("草泥马和欺实马是今年的流行词汇") + cuttest("伊藤洋华堂总府店") + cuttest("中国科学院计算技术研究所") + cuttest("罗密欧与朱丽叶") + cuttest("我购买了道具和服装") + cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍") + cuttest("湖北省石首市") + cuttest("湖北省十堰市") + cuttest("总经理完成了这件事情") + cuttest("电脑修好了") + cuttest("做好了这件事情就一了百了了") + cuttest("人们审美的观点是不同的") + cuttest("我们买了一个美的空调") + cuttest("线程初始化时我们要注意") + cuttest("一个分子是由好多原子组织成的") + cuttest("祝你马到功成") + cuttest("他掉进了无底洞里") + cuttest("中国的首都是北京") + cuttest("孙君意") + cuttest("外交部发言人马朝旭") + cuttest("领导人会议和第四届东亚峰会") + cuttest("在过去的这五年") + cuttest("还需要很长的路要走") + cuttest("60周年首都阅兵") + cuttest("你好人们审美的观点是不同的") + cuttest("买水果然后来世博园") + cuttest("买水果然后去世博园") + cuttest("但是后来我才知道你是对的") + cuttest("存在即合理") + cuttest("的的的的的在的的的的就以和和和") + cuttest("I love你,不以为耻,反以为rong") + cuttest("因") + cuttest("") + cuttest("hello你好人们审美的观点是不同的") + cuttest("很好但主要是基于网页形式") + cuttest("hello你好人们审美的观点是不同的") + cuttest("为什么我不能拥有想要的生活") + cuttest("后来我才") + cuttest("此次来中国是为了") + cuttest("使用了它就可以解决一些问题") + cuttest(",使用了它就可以解决一些问题") + cuttest("其实使用了它就可以解决一些问题") + cuttest("好人使用了它就可以解决一些问题") + cuttest("是因为和国家") + cuttest("老年搜索还支持") + cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ") + cuttest("大") + cuttest("") + cuttest("他说的确实在理") + cuttest("长春市长春节讲话") + cuttest("结婚的和尚未结婚的") + cuttest("结合成分子时") + cuttest("旅游和服务是最好的") + cuttest("这件事情的确是我的错") + cuttest("供大家参考指正") + cuttest("哈尔滨政府公布塌桥原因") + cuttest("我在机场入口处") + cuttest("邢永臣摄影报道") + cuttest("BP神经网络如何训练才能在分类时增加区分度?") + cuttest("南京市长江大桥") + cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究") + cuttest('长春市长春药店') + cuttest('邓颖超生前最喜欢的衣服') + cuttest('胡锦涛是热爱世界和平的政治局常委') + cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪') + cuttest('一次性交多少钱') + cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') + cuttest('小和尚留了一个像大和尚一样的和尚头') + cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') + cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') + # cuttest('AT&T是一件不错的公司,给你发offer了吗?') + # '了', '吗' / '了吗' + # cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') + # '是', '吗' / '是吗' + # cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') + # '他开' / '他', '开' + # cuttest('枪杆子中出政权') + # '中' / '中出' + cuttest('张三风同学走上了不归路') + # cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。') + # '阿Q' /'阿', 'Q' + # 'BB机' / 'BB', '机' + # 'AA制' / 'AA', '制' + # cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。') + # '1号店' / '1', '号店' + # '小S', '和', '大S', / '小', 'S', '和', '大', 'S' + # '3D' / '3', 'D' + # jieba.del_word('很赞') + cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?') + # '5%' / '5', '%' \ No newline at end of file diff --git a/performace_test/cppjieba.py b/performace_test/speed.py similarity index 100% rename from performace_test/cppjieba.py rename to performace_test/speed.py diff --git a/setup.py b/setup.py index 9c70d25..22de878 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from distutils.sysconfig import get_python_lib site_package_dir = get_python_lib() + os.path.sep -__version__ = '0.0.8' +__version__ = '0.0.9' class get_pybind_include(object): """Helper class to determine the pybind11 include path From 5718476df3adaabe77c71463fd4fcbcc08e33839 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 11 Jun 2018 15:49:43 +0800 Subject: [PATCH 38/63] o --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f94d0a..117f25c 100644 --- a/README.md +++ b/README.md @@ -66,13 +66,13 @@ for more: [example.py](example.py) , [tests](tests) ## 性能 -[performace_test/cppjieba.py](performace_test/cppjieba.py) +[performace_test/speed.py](performace_test/speed.py) 测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 | 方案 | 速度 | | ------------- |:-------------:| -| cppjieba-py | 6.218346 | +| cppjieba-py | 9.312873 | | jieba_fast | 31.315147 | | jieba | 1:24.703040 | From 3e3727d5b5cd0b56258331d3650b79c2dee75c0c Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 18 Aug 2018 16:10:36 +0800 Subject: [PATCH 39/63] add posseg.cut speed test,jieba wins --- README.md | 7 +++++++ performace_test/speed.py | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 117f25c..07591f4 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,13 @@ for more: [example.py](example.py) , [tests](tests) | jieba_fast | 31.315147 | | jieba | 1:24.703040 | +`posseg.cut` + +| 方案 | 速度 | +| ------------- |:-------------:| +| cppjieba-py | 11.860994 | +| jieba | 0.048153 | + ## Test `pip install ".[test]"` \ No newline at end of file diff --git a/performace_test/speed.py b/performace_test/speed.py index 7477ca8..aefae4b 100644 --- a/performace_test/speed.py +++ b/performace_test/speed.py @@ -7,25 +7,32 @@ #wget https://raw.githubusercontent.com/yanyiwu/practice/master/nodejs/nodejieba/performance/weicheng.utf8 -O performace_test/weicheng.utf8 if __name__ == "__main__": + if sys.argv[1] == "cppjieba_py":# 0:00:03.861202 import cppjieba_py as jieba + import cppjieba_py.posseg as pseg # 0:00:11.860994 elif sys.argv[1] == "jieba": # 0:01:24.703040 import jieba + import jieba.posseg as pseg # 0:00:00.048153 elif sys.argv[1] == "jieba_fast": import jieba_fast as jieba + import jieba_fast.posseg as pseg + + if len(sys.argv) == 3 and sys.argv[2] =="pseg": + method = pseg.cut + else: + method = jieba.cut lines = [] weicheng = os.path.join(os.path.dirname(__file__),"weicheng.utf8") for line in open(weicheng): - lines.append(line.strip()); - - - result = [""] * 10; - result[random.randint(0, 9)] = '/'.join(jieba.cut("南京长江大桥")) + lines.append(line.strip()) + result = [""] * 10 + result[random.randint(0, 9)] = '/'.join(str(method("南京长江大桥"))) starttime = datetime.datetime.now() for i in range(50): for line in lines: - r = '/'.join(jieba.cut(line)) + r = '/'.join(str(method(line))) # print(r) result[random.randint(0, 9)] = r #result[random.randint(0, 9)] = jieba.cut(line) From 390249256bc3e3b4762f5d72f5cab6df2c6c1be5 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 18 Aug 2018 16:53:10 +0800 Subject: [PATCH 40/63] posseg api compatibility HMM will be ignored in fact --- cppjieba_py/posseg.py | 5 +++-- performace_test/speed.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cppjieba_py/posseg.py b/cppjieba_py/posseg.py index e8f3dd8..1a10dc6 100644 --- a/cppjieba_py/posseg.py +++ b/cppjieba_py/posseg.py @@ -1,8 +1,9 @@ import libcppjieba -def cut(sentence): +def cut(sentence,HMM=False): it = libcppjieba.tag(sentence) return iter(it) -lcut = libcppjieba.tag \ No newline at end of file +def lcut(sentence,HMM=False): + return libcppjieba.tag(sentence) \ No newline at end of file diff --git a/performace_test/speed.py b/performace_test/speed.py index aefae4b..6644b79 100644 --- a/performace_test/speed.py +++ b/performace_test/speed.py @@ -27,12 +27,12 @@ for line in open(weicheng): lines.append(line.strip()) result = [""] * 10 - result[random.randint(0, 9)] = '/'.join(str(method("南京长江大桥"))) + result[random.randint(0, 9)] = '/'.join(str(method("南京长江大桥",HMM=False))) starttime = datetime.datetime.now() for i in range(50): for line in lines: - r = '/'.join(str(method(line))) + r = '/'.join(str(method(line,HMM=False))) # print(r) result[random.randint(0, 9)] = r #result[random.randint(0, 9)] = jieba.cut(line) From 3095fc2d84c212f35f46167b92caa337aca8c315 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 18 Aug 2018 16:57:41 +0800 Subject: [PATCH 41/63] release 0.0.10 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 22de878..47e818c 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from distutils.sysconfig import get_python_lib site_package_dir = get_python_lib() + os.path.sep -__version__ = '0.0.9' +__version__ = '0.0.10' class get_pybind_include(object): """Helper class to determine the pybind11 include path From e89aa3b0fff7d8dbdde5c589b9ea5686c604063e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sat, 18 Aug 2018 17:52:51 +0800 Subject: [PATCH 42/63] use lcut actually perform cut for speed test --- README.md | 2 +- performace_test/speed.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 07591f4..a7aab6e 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ for more: [example.py](example.py) , [tests](tests) | 方案 | 速度 | | ------------- |:-------------:| | cppjieba-py | 11.860994 | -| jieba | 0.048153 | +| jieba | 1:19.411649 | ## Test diff --git a/performace_test/speed.py b/performace_test/speed.py index 6644b79..183d523 100644 --- a/performace_test/speed.py +++ b/performace_test/speed.py @@ -19,9 +19,9 @@ import jieba_fast.posseg as pseg if len(sys.argv) == 3 and sys.argv[2] =="pseg": - method = pseg.cut + method = pseg.lcut else: - method = jieba.cut + method = jieba.lcut lines = [] weicheng = os.path.join(os.path.dirname(__file__),"weicheng.utf8") for line in open(weicheng): From 454a93decbc0a41123f0198b0ac987ec6572e3ae Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 19 Aug 2018 01:01:32 +0800 Subject: [PATCH 43/63] improve performance comparation --- README.md | 18 +++++++++++++----- performace_test/speed.py | 9 ++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a7aab6e..2c70c7a 100644 --- a/README.md +++ b/README.md @@ -70,17 +70,25 @@ for more: [example.py](example.py) , [tests](tests) 测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 +`lcut HMM=False` | 方案 | 速度 | | ------------- |:-------------:| -| cppjieba-py | 9.312873 | -| jieba_fast | 31.315147 | -| jieba | 1:24.703040 | +| cppjieba-py | 10.642102 | +| jieba-fast==0.51 | 26.129298 | +| jieba==0.39 | 50.623866 | -`posseg.cut` +`lcut HMM=True` +| 方案 | 速度 | +| ------------- |:-------------:| +| cppjieba-py | 13.139232 | +| jieba-fast==0.51 | 34.574907 | +| jieba==0.39 | 1:26.756226 | + +`posseg.lcut` | 方案 | 速度 | | ------------- |:-------------:| -| cppjieba-py | 11.860994 | +| cppjieba-py | 20.382905 | | jieba | 1:19.411649 | ## Test diff --git a/performace_test/speed.py b/performace_test/speed.py index 183d523..6c6bb64 100644 --- a/performace_test/speed.py +++ b/performace_test/speed.py @@ -18,21 +18,24 @@ import jieba_fast as jieba import jieba_fast.posseg as pseg - if len(sys.argv) == 3 and sys.argv[2] =="pseg": + if len(sys.argv) == 4 and sys.argv[3] =="pseg": method = pseg.lcut else: method = jieba.lcut + HMM = False + if len(sys.argv) >= 3 and sys.argv[2] =="hmm": + HMM = True lines = [] weicheng = os.path.join(os.path.dirname(__file__),"weicheng.utf8") for line in open(weicheng): lines.append(line.strip()) result = [""] * 10 - result[random.randint(0, 9)] = '/'.join(str(method("南京长江大桥",HMM=False))) + result[random.randint(0, 9)] = '/'.join(str(method("南京长江大桥",HMM=HMM))) starttime = datetime.datetime.now() for i in range(50): for line in lines: - r = '/'.join(str(method(line,HMM=False))) + r = '/'.join(str(method(line,HMM=HMM))) # print(r) result[random.randint(0, 9)] = r #result[random.randint(0, 9)] = jieba.cut(line) From 148db6ba0660f7e8be56fa1b0cc985e916eeea59 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 19 Aug 2018 01:03:18 +0800 Subject: [PATCH 44/63] o --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2c70c7a..eba902c 100644 --- a/README.md +++ b/README.md @@ -70,21 +70,23 @@ for more: [example.py](example.py) , [tests](tests) 测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 -`lcut HMM=False` +`lcut HMM=False` + | 方案 | 速度 | | ------------- |:-------------:| | cppjieba-py | 10.642102 | | jieba-fast==0.51 | 26.129298 | | jieba==0.39 | 50.623866 | -`lcut HMM=True` +`lcut HMM=True` + | 方案 | 速度 | | ------------- |:-------------:| | cppjieba-py | 13.139232 | | jieba-fast==0.51 | 34.574907 | | jieba==0.39 | 1:26.756226 | -`posseg.lcut` +`posseg.lcut` | 方案 | 速度 | | ------------- |:-------------:| From f506c2f96e926c8af6ca854a4294012ed8f5d3f8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 19 Aug 2018 01:04:43 +0800 Subject: [PATCH 45/63] o --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index eba902c..0e8f741 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,9 @@ for more: [example.py](example.py) , [tests](tests) | 方案 | 速度 | | ------------- |:-------------:| | cppjieba-py | 20.382905 | -| jieba | 1:19.411649 | +| jieba==0.39 | 1:19.411649 | ## Test -`pip install ".[test]"` \ No newline at end of file +`pip install ".[test]"` +`nosetests -c nose.cfg` \ No newline at end of file From 0741718a238df93f7504c3f64498675ad483d1fb Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 19 Aug 2018 12:15:55 +0800 Subject: [PATCH 46/63] update consistency test commented code will fails whether using hmm --- performace_test/consistency.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/performace_test/consistency.py b/performace_test/consistency.py index a5b30b6..322a47e 100644 --- a/performace_test/consistency.py +++ b/performace_test/consistency.py @@ -8,18 +8,21 @@ site_package_dir = get_python_lib() jieba_dict = os.path.join(site_package_dir,"jieba","dict.txt") tokenizer = cppjieba_py.Tokenizer(jieba_dict) +HMM = False +if "HMM" in os.environ: + HMM = True def cuttest(test_sent): - result = jieba.lcut(test_sent) + result = jieba.lcut(test_sent,HMM=HMM) # result2 = cppjieba_py.lcut(test_sent) - result2 = tokenizer.lcut(test_sent) + result2 = tokenizer.lcut(test_sent,HMM=HMM) print(result) print(result2) assert result == result2 if __name__ == "__main__": - cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。") + # cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。") cuttest("我不喜欢日本和服。") cuttest("雷猴回归人间。") cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作") @@ -100,7 +103,7 @@ def cuttest(test_sent): cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') cuttest('小和尚留了一个像大和尚一样的和尚头') cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') - cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') + # cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') # cuttest('AT&T是一件不错的公司,给你发offer了吗?') # '了', '吗' / '了吗' # cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') From 6d4bf8ac423deeb762098b65672bad1cb1f024cb Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 17 Mar 2019 07:51:09 +0800 Subject: [PATCH 47/63] open readme as utf8 in case user platform default encoding is not utf8 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 47e818c..051e17f 100644 --- a/setup.py +++ b/setup.py @@ -126,7 +126,7 @@ def build_extensions(self): author='bung87,yeping zheng', url='https://github.com/bung87/cppjieba-py/', description='python bindings of cppjieba', - long_description= open("README.md").read(), + long_description= open("README.md",encoding="utf8").read(), classifiers = classifiers, ext_modules=ext_modules, packages=['cppjieba_py','cppjieba.dict'], From 042859417b26646f11cb90a3249bc5229d0ecea8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Sun, 17 Mar 2019 07:58:57 +0800 Subject: [PATCH 48/63] open readme as utf8 in case user platform default encoding is not utf8 --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 051e17f..1a46662 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,12 @@ import sys import setuptools import os +import io from distutils.sysconfig import get_python_lib site_package_dir = get_python_lib() + os.path.sep -__version__ = '0.0.10' +__version__ = '0.0.11' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -126,7 +127,7 @@ def build_extensions(self): author='bung87,yeping zheng', url='https://github.com/bung87/cppjieba-py/', description='python bindings of cppjieba', - long_description= open("README.md",encoding="utf8").read(), + long_description= io.open("README.md",'r', encoding="utf-8").read(), classifiers = classifiers, ext_modules=ext_modules, packages=['cppjieba_py','cppjieba.dict'], From 22bd46543883744d341a4ab313da81b0b720d667 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 19:06:37 +0800 Subject: [PATCH 49/63] readme travis pypi --- .travis.yml | 9 ++++++++- README.md | 2 +- cppjieba | 2 +- setup.py | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index f091113..a3316ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,4 +15,11 @@ before_install: install: - pip install ".[test]" script: -- nosetests -c nose.cfg \ No newline at end of file +- nosetests -c nose.cfg +deploy: + provider: pypi + distributions: sdist bdist_wheel + user: __token__ + password: + secure: fzIztIC2P0xWqEUHSYelFcnbC7aRV6OCgTtLExafvWRLl6W555X3iHUXI6KsVN7YNZOedXl0TqhFrlbCXky/Ub9ReoV6/htd3xsubQUXWWALwA3bynAH46AQIe21UH6dhC62LNMbjHVnrEAC4w9GVlsigqIYESavcXCypWwk+bziiYpTFpkTxnSAFeelL2PzOEzvjlMvIu7lN15+ODuk/HmAPKO2FTTqsr2B8xkYQpC09vK482hSVblFJqokSPQmxhRMgZ+Q03zNQNnsvgZY2J8KYqsRSH/A1JRwxNorefsM5yTfY1sORgDD1MpwcxPhF5FGLJTzNJ6jXBW5l/uUUjLXaEa4ohNA9xWoQ+QKwGkCjPA4N3F9zaBTyS7vK07pBxPXN2RyBfmUS4DkpbEyGj/29lq8Ixe3q5LneAItlzhLzhSxtcQmqTA1dyutetbn9kxg/u6J7TYCw4UlCCoOKKKBln9kifoa7cNwJoPHfXqnY1roqppSmvgIwHJf8wdtJlfGYSW6jNEUKQPgn41kEK6shO5ue3Sxe+qk0kM0d7DUpa4ZI+6vGIx6A187Xj2x5NlLii6zunJweEK/ifxOOgdwTtVl/kgPr89Nzely1Qipx15D4p2q4r1A7Mk//LXx3gWDcVGjQA0pB49q+kASonaD0eSD184Im+I94YhlQSI= + skip_existing: true diff --git a/README.md b/README.md index 0e8f741..e1cef39 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # cppjieba-py -[![Build Status](https://travis-ci.org/bung87/cppjieba-py.svg?branch=master)](https://travis-ci.org/bung87/cppjieba-py) +[![Build Status](https://travis-ci.org/bung87/cppjieba-py.svg?branch=master)](https://travis-ci.org/bung87/cppjieba-py) [![PyPI](https://img.shields.io/pypi/v/cppjieba-py.svg)](https://pypi.python.org/pypi/cppjieba-py) cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 diff --git a/cppjieba b/cppjieba index 7b2fdc4..0d39528 160000 --- a/cppjieba +++ b/cppjieba @@ -1 +1 @@ -Subproject commit 7b2fdc41a235f332977ee2ca8c43715e7dc145e0 +Subproject commit 0d39528486674a89a86b170a76edc35b888dd30e diff --git a/setup.py b/setup.py index 1a46662..26c6020 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from distutils.sysconfig import get_python_lib site_package_dir = get_python_lib() + os.path.sep -__version__ = '0.0.11' +__version__ = '0.0.12' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -128,6 +128,7 @@ def build_extensions(self): url='https://github.com/bung87/cppjieba-py/', description='python bindings of cppjieba', long_description= io.open("README.md",'r', encoding="utf-8").read(), + long_description_content_type='text/markdown', classifiers = classifiers, ext_modules=ext_modules, packages=['cppjieba_py','cppjieba.dict'], From 50843a44c25efd668285e82f8c57ae37d4d4be29 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 19:22:53 +0800 Subject: [PATCH 50/63] pybind11==2.2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 26c6020..8274d4c 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ def build_extensions(self): ext.extra_compile_args = opts build_ext.build_extensions(self) -install_requires = ['pybind11>=2.2'] +install_requires = ['pybind11==2.2'] extras_require = { 'test': ['spec==1.4.1'] From b1d2a5abd55d042144d8f1a5902ec1b52f683cbf Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 19:29:31 +0800 Subject: [PATCH 51/63] o --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8274d4c..daea9e6 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ def build_extensions(self): ext.extra_compile_args = opts build_ext.build_extensions(self) -install_requires = ['pybind11==2.2'] +install_requires = ['pybind11==2.3.0'] extras_require = { 'test': ['spec==1.4.1'] From 775d940ee335af989be3213c4e0b4d87a6b7a2cb Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 19:48:24 +0800 Subject: [PATCH 52/63] ignore submodules --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index a3316ba..7c04016 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,8 @@ python: - '3.5' - '3.6' sudo: false +git: + submodules: false before_install: - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get -y update From 70119a8cf6f569bf72c09396f43d35901c1672ea Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 19:59:39 +0800 Subject: [PATCH 53/63] o --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7c04016..d3c63dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,8 +5,6 @@ python: - '3.5' - '3.6' sudo: false -git: - submodules: false before_install: - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get -y update @@ -14,6 +12,7 @@ before_install: - sudo apt-get -y install g++-5 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1 - g++ --version +- git submodule update --init cppjieba install: - pip install ".[test]" script: From 3d1a553467a1e314f32ea4dfccf3258fbd420d41 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 20:05:34 +0800 Subject: [PATCH 54/63] o --- .gitmodules | 1 + .travis.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.gitmodules b/.gitmodules index eff9268..ad52ecf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,5 +1,6 @@ [submodule "pybind11"] path = pybind11 + active = false url = https://github.com/pybind/pybind11.git [submodule "cppjieba"] path = cppjieba diff --git a/.travis.yml b/.travis.yml index d3c63dc..1fa00d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,8 @@ python: - '3.5' - '3.6' sudo: false +git: + submodules: false before_install: - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get -y update From b82193900f5af5b07a600b096148395c26e9ada8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 20:25:09 +0800 Subject: [PATCH 55/63] o --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index daea9e6..e63711a 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,11 @@ class get_pybind_include(object): method can be invoked. """ def __init__(self, user=False): + try: + import pybind11 + except ImportError: + if subprocess.call([sys.executable, '-m', 'pip', 'install', 'pybind11']): + raise RuntimeError('pybind11 install failed.') self.user = user def __str__(self): @@ -98,7 +103,7 @@ def build_extensions(self): ext.extra_compile_args = opts build_ext.build_extensions(self) -install_requires = ['pybind11==2.3.0'] +install_requires = ['pybind11>=2.2.0'] extras_require = { 'test': ['spec==1.4.1'] From e3a89f1401fe65afe84f264011a5757cbc121b7a Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Mon, 4 Nov 2019 20:30:36 +0800 Subject: [PATCH 56/63] o --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e63711a..9d3308c 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def build_extensions(self): ext.extra_compile_args = opts build_ext.build_extensions(self) -install_requires = ['pybind11>=2.2.0'] +install_requires = ['pybind11>=2.2.0',"setuptools >= 0.7.0"] extras_require = { 'test': ['spec==1.4.1'] From dc7bfe5311eec584ff2b4d269078e777f3e471f2 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Tue, 5 Nov 2019 22:59:02 +0800 Subject: [PATCH 57/63] v0.0.13 --- .gitmodules | 17 ++++++++++------- .travis.yml | 2 +- MANIFEST.in | 8 ++++---- README.md | 4 +++- cppjieba | 1 - cppjieba_py/cppjieba | 1 + {src => cppjieba_py/src}/main.cpp | 8 ++++---- setup.py | 28 ++++++++++++++-------------- tests/test_jieba.py | 2 +- tests/test_keyword_extractor.py | 2 +- tests/test_textrank_extractor.py | 2 +- tests/test_tokenizer.py | 2 +- 12 files changed, 41 insertions(+), 36 deletions(-) delete mode 160000 cppjieba create mode 160000 cppjieba_py/cppjieba rename {src => cppjieba_py/src}/main.cpp (98%) diff --git a/.gitmodules b/.gitmodules index ad52ecf..5a48eaa 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,10 @@ -[submodule "pybind11"] - path = pybind11 - active = false - url = https://github.com/pybind/pybind11.git -[submodule "cppjieba"] - path = cppjieba - url = https://github.com/bung87/cppjieba.git +# [submodule "pybind11"] +# path = pybind11 +# active = false +# url = https://github.com/pybind/pybind11.git +# [submodule "cppjieba"] +# path = cppjieba +# url = https://github.com/yanyiwu/cppjieba.git +[submodule "cppjieba_py/cppjieba"] + path = cppjieba_py/cppjieba + url = https://github.com/yanyiwu/cppjieba.git diff --git a/.travis.yml b/.travis.yml index 1fa00d1..3fd77aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: - sudo apt-get -y install g++-5 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1 - g++ --version -- git submodule update --init cppjieba +- git submodule update --init cppjieba_py/cppjieba install: - pip install ".[test]" script: diff --git a/MANIFEST.in b/MANIFEST.in index 30317ad..2042708 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include README.md include setup.py - -recursive-include cppjieba/include/cppjieba *.hpp -recursive-include cppjieba/dict *.utf8 -recursive-include cppjieba/deps *.hpp +recursive-include cppjieba_py/src *.cpp +recursive-include cppjieba_py/cppjieba/include/cppjieba *.hpp +recursive-include cppjieba_py/cppjieba/dict *.utf8 +recursive-include cppjieba_py/cppjieba/deps *.hpp diff --git a/README.md b/README.md index e1cef39..77a1f50 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,9 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 ``` $ git clone --recursive https://github.com/bung87/cppjieba-py - $ python setup.py install + $ pip install . # or + $ python setup.py install --old-and-unmanageable + without argument will install under egg dir,which cause libcppjieba found wrong default dictionaries directory ``` diff --git a/cppjieba b/cppjieba deleted file mode 160000 index 0d39528..0000000 --- a/cppjieba +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0d39528486674a89a86b170a76edc35b888dd30e diff --git a/cppjieba_py/cppjieba b/cppjieba_py/cppjieba new file mode 160000 index 0000000..79ffd00 --- /dev/null +++ b/cppjieba_py/cppjieba @@ -0,0 +1 @@ +Subproject commit 79ffd0097906bfaaa0fa8e5ce23f1a1d70ac5a81 diff --git a/src/main.cpp b/cppjieba_py/src/main.cpp similarity index 98% rename from src/main.cpp rename to cppjieba_py/src/main.cpp index d26a5e9..922ca34 100644 --- a/src/main.cpp +++ b/cppjieba_py/src/main.cpp @@ -11,10 +11,10 @@ using namespace std; namespace py = pybind11; -const string DICT_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/jieba.dict.utf8"); -const string HMM_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/hmm_model.utf8"); -const string IDF_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/idf.utf8"); -const string STOP_WORD_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba/dict/stop_words.utf8"); +const string DICT_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/jieba.dict.utf8"); +const string HMM_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/hmm_model.utf8"); +const string IDF_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/idf.utf8"); +const string STOP_WORD_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/stop_words.utf8"); using Word = cppjieba::Word; diff --git a/setup.py b/setup.py index 9d3308c..cc2b9de 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,12 @@ import setuptools import os import io - +import subprocess +from os import path from distutils.sysconfig import get_python_lib -site_package_dir = get_python_lib() + os.path.sep +site_package_dir = get_python_lib() + path.sep -__version__ = '0.0.12' +__version__ = '0.0.13' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -31,13 +32,16 @@ def __str__(self): ext_modules = [ Extension( 'libcppjieba', - ['src/main.cpp'], + # ['src/main.cpp'], + ["cppjieba_py/src/main.cpp"], include_dirs=[ # Path to pybind11 headers get_pybind_include(), get_pybind_include(user=True), - "cppjieba/include", - "cppjieba/deps" + # path.join(site_package_dir,"cppjieba",'include'), + # path.join(site_package_dir,"cppjieba",'deps') + "cppjieba_py/cppjieba/include", + "cppjieba_py/cppjieba/deps" ], language='c++' ), @@ -94,11 +98,10 @@ def build_extensions(self): opts.append(cpp_flag(self.compiler)) if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') - elif ct == 'msvc': + if ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - opts.append('/DSITE_PACKAGE_PATH=\\"%s\\"' % - site_package_dir) + opts.append('/DSITE_PACKAGE_PATH=\\"%s\\"' % site_package_dir) for ext in self.extensions: ext.extra_compile_args = opts build_ext.build_extensions(self) @@ -106,7 +109,7 @@ def build_extensions(self): install_requires = ['pybind11>=2.2.0',"setuptools >= 0.7.0"] extras_require = { - 'test': ['spec==1.4.1'] + 'test': ['spec>=1.4.1','nose>=1.3.7'] } if sys.version_info[0] <3: @@ -136,10 +139,7 @@ def build_extensions(self): long_description_content_type='text/markdown', classifiers = classifiers, ext_modules=ext_modules, - packages=['cppjieba_py','cppjieba.dict'], - package_data = { - 'cppjieba.dict': ['*.utf8'] - }, + packages=['cppjieba_py'], include_package_data=True, install_requires=install_requires, extras_require=extras_require, diff --git a/tests/test_jieba.py b/tests/test_jieba.py index ae84747..f6d1623 100644 --- a/tests/test_jieba.py +++ b/tests/test_jieba.py @@ -7,7 +7,7 @@ else: from pathlib2 import Path -DICT_DIR = Path("../cppjieba/dict") +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") USER_DICT = str(DICT_DIR / "user.dict.utf8") STOP_WORD = str(DICT_DIR / "stop_words.utf8") diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py index 3e05037..b46cab4 100644 --- a/tests/test_keyword_extractor.py +++ b/tests/test_keyword_extractor.py @@ -8,7 +8,7 @@ else: from pathlib2 import Path -DICT_DIR = Path("../cppjieba/dict") +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") IDF = str(DICT_DIR / "idf.utf8") STOP_WORD = str(DICT_DIR / "stop_words.utf8") diff --git a/tests/test_textrank_extractor.py b/tests/test_textrank_extractor.py index 2949304..e033ff8 100644 --- a/tests/test_textrank_extractor.py +++ b/tests/test_textrank_extractor.py @@ -8,7 +8,7 @@ else: from pathlib2 import Path -DICT_DIR = Path("../cppjieba/dict") +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") IDF = str(DICT_DIR / "idf.utf8") STOP_WORD = str(DICT_DIR / "stop_words.utf8") diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index f19f187..0c68311 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -8,7 +8,7 @@ else: from pathlib2 import Path -DICT_DIR = Path("../cppjieba/dict") +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") DICT = str(DICT_DIR / "jieba.dict.utf8") USER_DICT = str(DICT_DIR / "user.dict.utf8") STOP_WORD = str(DICT_DIR / "stop_words.utf8") From bc0e457d1ecfd12588382cf5d436b4e6c2c89610 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Nov 2019 07:40:46 +0800 Subject: [PATCH 58/63] o --- .gitignore | 1 + .travis.yml | 26 ++++++++++++++++++++++++++ pybind11 | 1 - travis/build-wheels.sh | 22 ++++++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) delete mode 160000 pybind11 create mode 100644 travis/build-wheels.sh diff --git a/.gitignore b/.gitignore index 9546843..5f5f4d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +performace_test/* # Byte-compiled / optimized / DLL files .DS_Store __pycache__/ diff --git a/.travis.yml b/.travis.yml index 3fd77aa..21117e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,25 @@ +notifications: + email: false + +matrix: + include: + - sudo: required + services: + - docker + env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 + PLAT=manylinux1_x86_64 + - sudo: required + services: + - docker + env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 + PRE_CMD=linux32 + PLAT=manylinux1_i686 + - sudo: required + services: + - docker + env: DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 + PLAT=manylinux2010_x86_64 + language: python python: - '2.7' @@ -19,7 +41,11 @@ install: - pip install ".[test]" script: - nosetests -c nose.cfg +before_deploy: +- docker run --rm -e PLAT=$PLAT -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh deploy: + # on: + # tags: true provider: pypi distributions: sdist bdist_wheel user: __token__ diff --git a/pybind11 b/pybind11 deleted file mode 160000 index 55dc131..0000000 --- a/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 55dc131944c764ba7e30085b971a9d70531114b3 diff --git a/travis/build-wheels.sh b/travis/build-wheels.sh new file mode 100644 index 0000000..ed00408 --- /dev/null +++ b/travis/build-wheels.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e -x + +# Install a system package required by our library +# yum install -y atlas-devel + +# Compile wheels +for PYBIN in /opt/python/*/bin; do + # "${PYBIN}/pip" install -r /io/dev-requirements.txt + "${PYBIN}/pip" wheel /io/ -w wheelhouse/ +done + +# Bundle external shared libraries into the wheels +for whl in wheelhouse/*.whl; do + auditwheel repair "$whl" --plat $PLAT -w /io/wheelhouse/ +done + +# Install packages and test +for PYBIN in /opt/python/*/bin/; do + "${PYBIN}/pip" install . --no-index -f /io/wheelhouse + # (cd "$HOME"; "${PYBIN}/nosetests" pymanylinuxdemo) +done \ No newline at end of file From fe7a60f7464089bf328012143bc0c09237abe5db Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Nov 2019 08:02:36 +0800 Subject: [PATCH 59/63] o --- travis/build-wheels.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 travis/build-wheels.sh diff --git a/travis/build-wheels.sh b/travis/build-wheels.sh old mode 100644 new mode 100755 From 19d34d98f2884ac4933dcd1fd70671f35ecb7a57 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Nov 2019 08:09:02 +0800 Subject: [PATCH 60/63] o --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 21117e7..f483ac4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ install: script: - nosetests -c nose.cfg before_deploy: -- docker run --rm -e PLAT=$PLAT -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh +- docker run --rm -e PLAT=$PLAT -v "$(pwd)":/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh deploy: # on: # tags: true From f4f5a156235ba02a50b26d3b8b550c6022ab4163 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Nov 2019 08:27:16 +0800 Subject: [PATCH 61/63] o --- travis/build-wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/build-wheels.sh b/travis/build-wheels.sh index ed00408..a5477a1 100755 --- a/travis/build-wheels.sh +++ b/travis/build-wheels.sh @@ -17,6 +17,6 @@ done # Install packages and test for PYBIN in /opt/python/*/bin/; do - "${PYBIN}/pip" install . --no-index -f /io/wheelhouse + "${PYBIN}/pip" install cppjieba-py --no-index -f /io/wheelhouse # (cd "$HOME"; "${PYBIN}/nosetests" pymanylinuxdemo) done \ No newline at end of file From 9dd5b8627643f088c4569b9472283046f0df043b Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Nov 2019 09:28:31 +0800 Subject: [PATCH 62/63] skip bdist_wheel as pypi not allow --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f483ac4..e59711a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,7 +47,7 @@ deploy: # on: # tags: true provider: pypi - distributions: sdist bdist_wheel + distributions: sdist user: __token__ password: secure: fzIztIC2P0xWqEUHSYelFcnbC7aRV6OCgTtLExafvWRLl6W555X3iHUXI6KsVN7YNZOedXl0TqhFrlbCXky/Ub9ReoV6/htd3xsubQUXWWALwA3bynAH46AQIe21UH6dhC62LNMbjHVnrEAC4w9GVlsigqIYESavcXCypWwk+bziiYpTFpkTxnSAFeelL2PzOEzvjlMvIu7lN15+ODuk/HmAPKO2FTTqsr2B8xkYQpC09vK482hSVblFJqokSPQmxhRMgZ+Q03zNQNnsvgZY2J8KYqsRSH/A1JRwxNorefsM5yTfY1sORgDD1MpwcxPhF5FGLJTzNJ6jXBW5l/uUUjLXaEa4ohNA9xWoQ+QKwGkCjPA4N3F9zaBTyS7vK07pBxPXN2RyBfmUS4DkpbEyGj/29lq8Ixe3q5LneAItlzhLzhSxtcQmqTA1dyutetbn9kxg/u6J7TYCw4UlCCoOKKKBln9kifoa7cNwJoPHfXqnY1roqppSmvgIwHJf8wdtJlfGYSW6jNEUKQPgn41kEK6shO5ue3Sxe+qk0kM0d7DUpa4ZI+6vGIx6A187Xj2x5NlLii6zunJweEK/ifxOOgdwTtVl/kgPr89Nzely1Qipx15D4p2q4r1A7Mk//LXx3gWDcVGjQA0pB49q+kASonaD0eSD184Im+I94YhlQSI= From 95772e35ac3860418bf140f3c5a5b79c699db2a2 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 6 Nov 2019 09:32:37 +0800 Subject: [PATCH 63/63] o --- .travis.yml | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.travis.yml b/.travis.yml index e59711a..76e9794 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,24 @@ notifications: email: false -matrix: - include: - - sudo: required - services: - - docker - env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 - PLAT=manylinux1_x86_64 - - sudo: required - services: - - docker - env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 - PRE_CMD=linux32 - PLAT=manylinux1_i686 - - sudo: required - services: - - docker - env: DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 - PLAT=manylinux2010_x86_64 +# matrix: +# include: +# - sudo: required +# services: +# - docker +# env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 +# PLAT=manylinux1_x86_64 +# - sudo: required +# services: +# - docker +# env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 +# PRE_CMD=linux32 +# PLAT=manylinux1_i686 +# - sudo: required +# services: +# - docker +# env: DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 +# PLAT=manylinux2010_x86_64 language: python python: @@ -41,8 +41,8 @@ install: - pip install ".[test]" script: - nosetests -c nose.cfg -before_deploy: -- docker run --rm -e PLAT=$PLAT -v "$(pwd)":/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh +# before_deploy: +# - docker run --rm -e PLAT=$PLAT -v "$(pwd)":/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh deploy: # on: # tags: true