diff --git a/.gitignore b/.gitignore index 9546843..5f5f4d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +performace_test/* # Byte-compiled / optimized / DLL files .DS_Store __pycache__/ diff --git a/.gitmodules b/.gitmodules index cce53b4..5a48eaa 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,10 @@ -[submodule "pybind11"] - path = pybind11 - url = https://github.com/pybind/pybind11.git -[submodule "cppjieba"] - path = cppjieba +# [submodule "pybind11"] +# path = pybind11 +# active = false +# url = https://github.com/pybind/pybind11.git +# [submodule "cppjieba"] +# path = cppjieba +# url = https://github.com/yanyiwu/cppjieba.git +[submodule "cppjieba_py/cppjieba"] + path = cppjieba_py/cppjieba url = https://github.com/yanyiwu/cppjieba.git diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..76e9794 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,54 @@ +notifications: + email: false + +# matrix: +# include: +# - sudo: required +# services: +# - docker +# env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 +# PLAT=manylinux1_x86_64 +# - sudo: required +# services: +# - docker +# env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 +# PRE_CMD=linux32 +# PLAT=manylinux1_i686 +# - sudo: required +# services: +# - docker +# env: DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 +# PLAT=manylinux2010_x86_64 + +language: python +python: +- '2.7' +- '3.4' +- '3.5' +- '3.6' +sudo: false +git: + submodules: false +before_install: +- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +- sudo apt-get -y update +- sudo apt-get -y install build-essential +- sudo apt-get -y install g++-5 +- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1 +- g++ --version +- git submodule update --init cppjieba_py/cppjieba +install: +- pip install ".[test]" +script: +- nosetests -c nose.cfg +# before_deploy: +# - docker run --rm -e PLAT=$PLAT -v "$(pwd)":/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh +deploy: + # on: + # tags: true + 
provider: pypi + distributions: sdist + user: __token__ + password: + secure: fzIztIC2P0xWqEUHSYelFcnbC7aRV6OCgTtLExafvWRLl6W555X3iHUXI6KsVN7YNZOedXl0TqhFrlbCXky/Ub9ReoV6/htd3xsubQUXWWALwA3bynAH46AQIe21UH6dhC62LNMbjHVnrEAC4w9GVlsigqIYESavcXCypWwk+bziiYpTFpkTxnSAFeelL2PzOEzvjlMvIu7lN15+ODuk/HmAPKO2FTTqsr2B8xkYQpC09vK482hSVblFJqokSPQmxhRMgZ+Q03zNQNnsvgZY2J8KYqsRSH/A1JRwxNorefsM5yTfY1sORgDD1MpwcxPhF5FGLJTzNJ6jXBW5l/uUUjLXaEa4ohNA9xWoQ+QKwGkCjPA4N3F9zaBTyS7vK07pBxPXN2RyBfmUS4DkpbEyGj/29lq8Ixe3q5LneAItlzhLzhSxtcQmqTA1dyutetbn9kxg/u6J7TYCw4UlCCoOKKKBln9kifoa7cNwJoPHfXqnY1roqppSmvgIwHJf8wdtJlfGYSW6jNEUKQPgn41kEK6shO5ue3Sxe+qk0kM0d7DUpa4ZI+6vGIx6A187Xj2x5NlLii6zunJweEK/ifxOOgdwTtVl/kgPr89Nzely1Qipx15D4p2q4r1A7Mk//LXx3gWDcVGjQA0pB49q+kASonaD0eSD184Im+I94YhlQSI= + skip_existing: true diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..2042708 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include README.md +include setup.py +recursive-include cppjieba_py/src *.cpp +recursive-include cppjieba_py/cppjieba/include/cppjieba *.hpp +recursive-include cppjieba_py/cppjieba/dict *.utf8 +recursive-include cppjieba_py/cppjieba/deps *.hpp diff --git a/README.md b/README.md index 5cae8b3..77a1f50 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,45 @@ -# cppjieba-py +# cppjieba-py -cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 +[![Build Status](https://travis-ci.org/bung87/cppjieba-py.svg?branch=master)](https://travis-ci.org/bung87/cppjieba-py) [![PyPI](https://img.shields.io/pypi/v/cppjieba-py.svg)](https://pypi.python.org/pypi/cppjieba-py) -## 性能 +cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 -测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 +由于只是对cppjieba的接口进行的封装,所以执行效率上非常接近于原cppjieba。 -| 方案 | 速度 | -| ------------- |:-------------:| -| cppjieba-py | 8s | -| jieba | 
77s | +项目主要分为两个部分**libcppjieba** 为 cppjieba 的 python extension, +**cppjieba_py** 为使开发者平滑过渡到使用cppjieba-py而作的 python package。 +具体见[example.py](example.py)。 + +### 区别 + +* 原jieba的`.cut`族接口基本都由python的`iter()`函数包裹list结果来实现。 +* 原jieba的`.set_*`方法基本都由class传入初始化的参数来完成。 +* `.del_word` 和`.suggest_freq` cppjieba没提供。 +* `POSTokenizer.lcut` 在`Tokenizer.tag` 下, 唯一一个只提供了list返回类型的接口。 + +## 安装 + +* pypi + + ```pip install cppjieba-py``` + + 或者你设置的安装源并未收录本项目 + + ```pip install -i https://pypi.org/simple/ cppjieba-py``` + +* 从发行包安装 + see [releases](https://github.com/bung87/cppjieba-py/releases) + + ```pip install https://github.com/bung87/cppjieba-py/files//cppjieba_py-.tar.gz``` + +* 从源代码安装 + + ``` + $ git clone --recursive https://github.com/bung87/cppjieba-py + $ pip install . # or + $ python setup.py install --old-and-unmanageable + without this argument it installs under an egg dir, which causes libcppjieba to look for the default dictionaries in the wrong directory + ``` ## 使用 @@ -18,10 +48,10 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装 ```python # -*- coding: utf-8 -*- -from cppjieba_py import jieba - -jieba_instance = jieba("cppjieba/dict/user.dict.utf8") -seg_list = jieba_instance.cut("我来到北京清华大学") +import cppjieba_py as jieba +# or use default Tokenizer: jieba.cut +jieba_instance = jieba.Tokenizer() +seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 @@ -34,12 +64,38 @@ print(", ".join(seg_list)) ``` -## 安装 +for more: [example.py](example.py) , [tests](tests) -* 从源代码安装 +## 性能 - ``` - $ git clone --recursive https://github.com/fantasy/cppjieba-py - $ python setup.py build - $ python setup.py install - ``` \ No newline at end of file +[performace_test/speed.py](performace_test/speed.py) + +测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。 + +`lcut HMM=False` + +| 方案 | 速度 | +| 
------------- |:-------------:| +| cppjieba-py | 10.642102 | +| jieba-fast==0.51 | 26.129298 | +| jieba==0.39 | 50.623866 | + +`lcut HMM=True` + +| 方案 | 速度 | +| ------------- |:-------------:| +| cppjieba-py | 13.139232 | +| jieba-fast==0.51 | 34.574907 | +| jieba==0.39 | 1:26.756226 | + +`posseg.lcut` + +| 方案 | 速度 | +| ------------- |:-------------:| +| cppjieba-py | 20.382905 | +| jieba==0.39 | 1:19.411649 | + +## Test + +`pip install ".[test]"` +`nosetests -c nose.cfg` \ No newline at end of file diff --git a/cppjieba b/cppjieba deleted file mode 160000 index 6aff1f6..0000000 --- a/cppjieba +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6aff1f637c784c27af6bb0868a94ba22617e65b0 diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py new file mode 100644 index 0000000..7122fc4 --- /dev/null +++ b/cppjieba_py/__init__.py @@ -0,0 +1,40 @@ + +import libcppjieba +from libcppjieba import Tokenizer,add_word,tokenize,load_userdict,find,lookup_tag +from libcppjieba import lcut,lcut_for_search,initialize +from libcppjieba import cut_all as _cut_all,lcut_all + +def _iter_wraps_doc(origin): + return origin.__doc__.replace(origin.__name__,"Iterator wraps %s" % origin.__name__,1) + +def cut(*args,**kvargs): + it = libcppjieba.cut(*args,**kvargs) + return iter(it) + +def cut_all(*args,**kvargs): + it = _cut_all(*args,**kvargs) + return iter(it) + +cut.__doc__ = _iter_wraps_doc(libcppjieba.cut) + +def cut_for_search(*args,**kvargs): + it = libcppjieba.cut_for_search(*args,**kvargs) + return iter(it) + +cut_for_search.__doc__ = _iter_wraps_doc(libcppjieba.cut_for_search) + +def _cut(ins,*args,**kvargs): + it = ins.cut_internal(*args,**kvargs) + return iter(it) + +def _cut_for_search(ins,*args,**kvargs): + it = ins.cut_for_search_internal(*args,**kvargs) + return iter(it) + +_cut.__doc__ = _iter_wraps_doc(Tokenizer.cut_internal) + +_cut_for_search.__doc__ = _iter_wraps_doc(Tokenizer.cut_for_search_internal) + +setattr(Tokenizer,"cut",_cut) 
+setattr(Tokenizer,"cut_for_search",_cut_for_search) + diff --git a/cppjieba_py/analyse.py b/cppjieba_py/analyse.py new file mode 100644 index 0000000..bf4e5f5 --- /dev/null +++ b/cppjieba_py/analyse.py @@ -0,0 +1,25 @@ +# pylint: disable=E0611 +from libcppjieba import get_default_keyword_extractor ,\ + get_default_textrank_extractor + +from libcppjieba import KeywordExtractor ,\ + TextRankExtractor +# pylint: enable=E0611 + +TextRank = TextRankExtractor +TFIDF = KeywordExtractor + +def _textrank(self,sentence, topK=20, withWeight=False): + if not withWeight: + return self.textrank_no_weight(sentence,topK) + else: + return self.textrank_with_weight(sentence,topK) + +setattr(TextRank,"textrank",_textrank) + +keywordExtractor = get_default_keyword_extractor() +textrankExtractor = get_default_textrank_extractor() + +extract_tags = keywordExtractor.extract_tags +textrank = textrankExtractor.textrank + diff --git a/cppjieba_py/cppjieba b/cppjieba_py/cppjieba new file mode 160000 index 0000000..79ffd00 --- /dev/null +++ b/cppjieba_py/cppjieba @@ -0,0 +1 @@ +Subproject commit 79ffd0097906bfaaa0fa8e5ce23f1a1d70ac5a81 diff --git a/cppjieba_py/posseg.py b/cppjieba_py/posseg.py new file mode 100644 index 0000000..1a10dc6 --- /dev/null +++ b/cppjieba_py/posseg.py @@ -0,0 +1,9 @@ + +import libcppjieba + +def cut(sentence,HMM=False): + it = libcppjieba.tag(sentence) + return iter(it) + +def lcut(sentence,HMM=False): + return libcppjieba.tag(sentence) \ No newline at end of file diff --git a/cppjieba_py/src/main.cpp b/cppjieba_py/src/main.cpp new file mode 100644 index 0000000..922ca34 --- /dev/null +++ b/cppjieba_py/src/main.cpp @@ -0,0 +1,453 @@ +#ifndef SITE_PACKAGE_PATH +#define SITE_PACKAGE_PATH STR_VALUE(SITE_PACKAGE_PATH) +#endif +#include +#include +#include +#include "cppjieba/Jieba.hpp" +#include "cppjieba/TextRankExtractor.hpp" +#include + +using namespace std; +namespace py = pybind11; + +const string DICT_PATH = string(SITE_PACKAGE_PATH) + 
string("cppjieba_py/cppjieba/dict/jieba.dict.utf8"); +const string HMM_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/hmm_model.utf8"); +const string IDF_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/idf.utf8"); +const string STOP_WORD_PATH = string(SITE_PACKAGE_PATH) + string("cppjieba_py/cppjieba/dict/stop_words.utf8"); + +using Word = cppjieba::Word; + +using WordVector = vector; + +using WordsTaged = vector>; + +struct Tokenizer +{ + cppjieba::Jieba *jieba; + + public: + Tokenizer() + { + + jieba = new cppjieba::Jieba(DICT_PATH, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); + }; + + Tokenizer(const string &main_dict) + { + + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, "", IDF_PATH, STOP_WORD_PATH); + }; + + Tokenizer(const string &main_dict, const string &user_dict) + { + + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, STOP_WORD_PATH); + }; + + Tokenizer(const string &main_dict, const string &user_dict, const string &stop_word_path) + { + jieba = new cppjieba::Jieba(main_dict, HMM_PATH, user_dict, IDF_PATH, stop_word_path); + }; + + vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) + { + vector> result; + vector words; + if (mode.compare("default") == 0) + { + jieba->Cut(sentence, words, HMM); + } + else + { + jieba->CutForSearch(sentence, words, HMM); + } + + vector::const_iterator it; + it = words.begin(); + while (it != words.end()) + { + result.push_back(make_tuple(it->word, it->unicode_offset, it->unicode_offset + it->unicode_length)); + ++it; + } + return result; + }; + + void load_userdict(const vector &buf) + { + jieba->LoadUserDict(buf); + }; + + void load_userdict(const set &buf) + { + jieba->LoadUserDict(buf); + }; + + void load_userdict(const string &path) + { + jieba->LoadUserDict(path); + }; + + WordVector cut_internal(const string &sentence, bool cut_all = false, bool HMM = true) + { + WordVector words; + if (!cut_all) + { + 
jieba->Cut(sentence, words, HMM); + } + else + { + jieba->CutAll(sentence, words); + } + return words; + }; + + vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) + { + vector words; + if (!cut_all) + { + jieba->Cut(sentence, words, HMM); + } + else + { + jieba->CutAll(sentence, words); + } + + return words; + }; + + vector cut_all(const string &sentence) + { + vector words; + jieba->CutAll(sentence, words); + return words; + }; + + vector lcut_all(const string &sentence) + { + vector words; + jieba->CutAll(sentence, words); + return words; + }; + + WordVector cut_for_search_internal(const string &sentence, bool HMM = true) + { + WordVector words; + jieba->CutForSearch(sentence, words, HMM); + return words; + }; + + vector lcut_for_search(const string &sentence, bool HMM = true) + { + vector words; + jieba->CutForSearch(sentence, words, HMM); + return words; + }; + + WordsTaged tag(const string &sentence) + { + WordsTaged words; + jieba->Tag(sentence, words); + return words; + }; + + bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) + { + return jieba->InsertUserWord(word, tag); + }; + + bool add_word(const string &word, int freq, const string &tag = cppjieba::UNKNOWN_TAG) + { + return jieba->InsertUserWord(word, freq, tag); + }; + + bool find(const string &word) + { + return jieba->Find(word); + }; + + string lookup_tag(const string &word) const + { + return jieba->LookupTag(word); + }; +}; + +namespace Jieba +{ +struct KeywordExtractor +{ + private: + Tokenizer *tokenizer; + cppjieba::KeywordExtractor *keywordExtractor; + + void initKeyowrdExtractor(const string &idfPath = IDF_PATH, + const string &stopWordPath = STOP_WORD_PATH) + { + keywordExtractor = new cppjieba::KeywordExtractor(tokenizer->jieba->GetDictTrie(), tokenizer->jieba->GetHMMModel(), idfPath, stopWordPath); + }; + + public: + KeywordExtractor(Tokenizer *t) : tokenizer(t) + { + initKeyowrdExtractor(); + }; + + KeywordExtractor(Tokenizer *t, const 
string &idfPath, + const string &stopWordPath) : tokenizer(t) + { + initKeyowrdExtractor(idfPath, stopWordPath); + }; + + vector extract_tags(const string &sentence, size_t topK = 20) + { + vector keywords; + keywordExtractor->Extract(sentence, keywords, topK); + return keywords; + }; +}; + +struct TextRankExtractor +{ + private: + Tokenizer *tokenizer; + cppjieba::TextRankExtractor *textRankExtractor; + + void initTextRankExtractor(const string &stopWordPath = STOP_WORD_PATH) + + { + textRankExtractor = new cppjieba::TextRankExtractor(tokenizer->jieba->GetDictTrie(), tokenizer->jieba->GetHMMModel(), stopWordPath); + }; + + public: + TextRankExtractor(Tokenizer *t) : tokenizer(t) + { + initTextRankExtractor(); + }; + + TextRankExtractor(Tokenizer *t, const string &stopWordPath) : tokenizer(t) + { + initTextRankExtractor(stopWordPath); + }; + + vector textrank_no_weight(const string &sentence, size_t topK = 20) + { + vector keywords; + textRankExtractor->Extract(sentence, keywords, topK); + return keywords; + }; + + vector> textrank_with_weight(const string &sentence, size_t topK = 20) + { + vector> keywords; + textRankExtractor->Extract(sentence, keywords, topK); + return keywords; + }; +}; + +Tokenizer *dt; +KeywordExtractor *keywordExtractor; +TextRankExtractor *textRankExtractor; + +void initialize() +{ + + dt = new Tokenizer(); +}; + +void init_check() +{ + if (!dt) + { + initialize(); + } +}; + +Tokenizer *get_default_tokenizer() +{ + init_check(); + return dt; +}; + +void init_check_textrank_extractor() +{ + if (!textRankExtractor) + { + textRankExtractor = new TextRankExtractor(get_default_tokenizer()); + } +}; + +TextRankExtractor *get_default_textrank_extractor() +{ + init_check_textrank_extractor(); + return textRankExtractor; +}; + +void init_check_keywordExtractor() +{ + if (!keywordExtractor) + { + keywordExtractor = new KeywordExtractor(get_default_tokenizer()); + } +}; + +KeywordExtractor *get_default_keyword_extractor() +{ + 
init_check_keywordExtractor(); + return keywordExtractor; +}; + +WordsTaged tag(const string &sentence) +{ + init_check(); + return dt->tag(sentence); +}; + +WordVector cut(const string &sentence, bool cut_all = false, bool HMM = true) +{ + init_check(); + return dt->cut_internal(sentence, cut_all, HMM); +}; + +vector lcut(const string &sentence, bool cut_all = false, bool HMM = true) +{ + init_check(); + return dt->lcut(sentence, cut_all, HMM); +}; + +vector lcut_all(const string &sentence) +{ + init_check(); + return dt->lcut_all(sentence); +}; + +WordVector cut_for_search(const string &sentence, bool HMM = true) +{ + init_check(); + return dt->cut_for_search_internal(sentence, HMM); +}; + +vector cut_all(const string &sentence) +{ + init_check(); + return dt->cut_all(sentence); +}; + +vector lcut_for_search(const string &sentence, bool HMM = true) +{ + init_check(); + return dt->lcut_for_search(sentence, HMM); +}; + +bool add_word(const string &word, const string &tag = cppjieba::UNKNOWN_TAG) +{ + init_check(); + return dt->add_word(word, tag); +}; + +bool add_word(const string &word, int freq, const string &tag = cppjieba::UNKNOWN_TAG) +{ + return dt->add_word(word, freq, tag); +}; + +vector> tokenize(const string &sentence, const string &mode = "default", bool HMM = true) +{ + init_check(); + return dt->tokenize(sentence, mode, HMM); +}; + +void load_userdict2(const vector &buf) +{ + init_check(); + dt->load_userdict(buf); +}; + +void load_userdict3(const set &buf) +{ + init_check(); + dt->load_userdict(buf); +}; + +void load_userdict(const string &path) +{ + init_check(); + dt->load_userdict(path); +}; + +bool find(const string &word) +{ + init_check(); + return dt->find(word); +}; + +const string lookup_tag(const string &word) +{ + init_check(); + return dt->lookup_tag(word); +}; + +}; // namespace Jieba + +PYBIND11_MODULE(libcppjieba, m) +{ + m.doc() = "python extension for cppjieba"; // optional module docstring + + m.def("cut", &Jieba::cut, 
py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); + m.def("lcut", &Jieba::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true); + m.def("cut_all", &Jieba::cut_all); + m.def("lcut_all", &Jieba::lcut_all); + m.def("lcut_for_search", &Jieba::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true); + m.def("cut_for_search", &Jieba::cut_for_search, py::arg("sentence"), py::arg("HMM") = true); + m.def("tag", &Jieba::tag, py::arg("sentence")); + m.def("initialize", &Jieba::initialize); + m.def("get_default_keyword_extractor", &Jieba::get_default_keyword_extractor); + m.def("get_default_textrank_extractor", &Jieba::get_default_textrank_extractor); + m.def("add_word", (bool (*)(const string &, const string &)) & Jieba::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG); + m.def("add_word", (bool (*)(const string &, int freq, const string & )) & Jieba::add_word, py::arg("word"), py::arg("freq"), py::arg("tag") = cppjieba::UNKNOWN_TAG); + m.def("tokenize", &Jieba::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true); + m.def("load_userdict", (void (*)(const vector &)) & Jieba::load_userdict2); + m.def("load_userdict", (void (*)(const set &)) & Jieba::load_userdict3); + m.def("load_userdict", (void (*)(const string &)) & Jieba::load_userdict); + m.def("find", &Jieba::find); + m.def("lookup_tag", &Jieba::lookup_tag); + + py::class_(m, "KeywordExtractor") + .def(py::init()) + .def(py::init()) + .def("extract_tags", &Jieba::KeywordExtractor::extract_tags, py::arg("sentence"), py::arg("topK") = 20); + + py::class_(m, "TextRankExtractor") + .def(py::init()) + .def(py::init()) + .def("textrank_no_weight", &Jieba::TextRankExtractor::textrank_no_weight, py::arg("sentence"), py::arg("topK") = 20) + .def("textrank_with_weight", &Jieba::TextRankExtractor::textrank_with_weight, py::arg("sentence"), py::arg("topK") = 20); + + py::class_(m, "Tokenizer") + .def(py::init<>()) + .def(py::init()) 
+ .def(py::init()) + .def(py::init()) + .def("cut_internal", &Tokenizer::cut_internal, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) + .def("lcut", &Tokenizer::lcut, py::arg("sentence"), py::arg("cut_all") = false, py::arg("HMM") = true) + .def("lcut_all", &Tokenizer::lcut_all) + .def("cut_all", &Tokenizer::cut_all) + .def("lcut_for_search", &Tokenizer::lcut_for_search, py::arg("sentence"), py::arg("HMM") = true) + .def("cut_for_search_internal", &Tokenizer::cut_for_search_internal, py::arg("sentence"), py::arg("HMM") = true) + .def("tag", &Tokenizer::tag, py::arg("sentence")) + .def("add_word", (bool (Tokenizer::*)(const string &, const string & )) & Tokenizer::add_word, py::arg("word"), py::arg("tag") = cppjieba::UNKNOWN_TAG) + .def("add_word", (bool (Tokenizer::*)(const string &, int freq, const string & )) & Tokenizer::add_word, py::arg("word"), py::arg("freq"), py::arg("tag") = cppjieba::UNKNOWN_TAG) + .def("tokenize", &Tokenizer::tokenize, py::arg("sentence"), py::arg("mode") = "default", py::arg("HMM") = true) + .def("load_userdict", (void (Tokenizer::*)(const vector &)) & Tokenizer::load_userdict) + .def("load_userdict", (void (Tokenizer::*)(const string &)) & Tokenizer::load_userdict) + .def("load_userdict", (void (Tokenizer::*)(const set &)) & Tokenizer::load_userdict) + .def("find", &Tokenizer::find) + .def("lookup_tag", &Tokenizer::lookup_tag); + // py::class_(m, "Word") + // .def_readonly("word", &Word::word) + // .def("__str__", [](const Word &v) { + // return v.word; + // }) + // .def("__repr__", [](const Word &v) { + // return v.word; + // }); +} \ No newline at end of file diff --git a/example.py b/example.py index 62f47a1..c11836c 100644 --- a/example.py +++ b/example.py @@ -1,19 +1,59 @@ -from cppjieba_py import jieba +from cppjieba_py import Tokenizer, cut, tokenize, cut_for_search, lcut, lcut_for_search, initialize, load_userdict +import cppjieba_py.posseg as pseg +import datetime +from cppjieba_py import analyse +from 
cppjieba_py.analyse import TextRank,TFIDF def main(): - jieba_instance = jieba("cppjieba/dict/user.dict.utf8") - seg_list = jieba_instance.cut("我来到北京清华大学") + jieba_instance = Tokenizer() + seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True) + print(type(seg_list)) print("Full Mode: " + "/ ".join(seg_list)) # 全模式 - seg_list = jieba_instance.cut("他来到了网易杭研大厦") # 默认是精确模式 print(", ".join(seg_list)) seg_list = jieba_instance.cut_for_search( "小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 print(", ".join(seg_list)) - + + t1 = datetime.datetime.now() + initialize() + t2 = datetime.datetime.now() + print("initialize costs:%s" % (t2 - t1)) + + print(lcut("我来到北京清华大学")) + print(list(cut("我来到北京清华大学"))) + print(cut("我来到北京清华大学",cut_all=True)) + print(lcut_for_search("我来到北京清华大学")) + print(list(cut_for_search("我来到北京清华大学"))) + + print(pseg.lcut("我来到北京清华大学")) + print(list(pseg.cut("我来到北京清华大学"))) + + s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" + r = analyse.extract_tags(s) + print(r) + + r = analyse.textrank(s, withWeight=True) + print(r) + + tr = TextRank(jieba_instance) + print(tr.textrank(s,topK=2,withWeight=True)) + + tf = TFIDF(jieba_instance) + print(tf.extract_tags(s,topK=10)) + + result = jieba_instance.tokenize('永和服装饰品有限公司') + for tk in result: + print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + + print(tokenize('永和服装饰品有限公司',mode="search")) + + jieba_instance.load_userdict(["卧槽"]) + + load_userdict(set(["卧槽"])) if __name__ == '__main__': main() diff --git a/nose.cfg b/nose.cfg new file mode 100644 index 0000000..bc6caf5 --- /dev/null +++ b/nose.cfg @@ -0,0 +1,4 @@ +[nosetests] +where=tests +with-specplugin=1 +with-specselector=1 \ No newline at end of file diff --git a/performace_test/consistency.py b/performace_test/consistency.py new file mode 100644 index 0000000..322a47e --- /dev/null +++ b/performace_test/consistency.py @@ -0,0 +1,126 @@ +#encoding=utf-8 +import 
sys,os +import jieba +import cppjieba_py +# wget https://raw.githubusercontent.com/fxsjy/jieba/master/jieba/dict.txt -O performace_test/dict.txt + +from distutils.sysconfig import get_python_lib +site_package_dir = get_python_lib() +jieba_dict = os.path.join(site_package_dir,"jieba","dict.txt") +tokenizer = cppjieba_py.Tokenizer(jieba_dict) +HMM = False +if "HMM" in os.environ: + HMM = True + +def cuttest(test_sent): + result = jieba.lcut(test_sent,HMM=HMM) + # result2 = cppjieba_py.lcut(test_sent) + result2 = tokenizer.lcut(test_sent,HMM=HMM) + print(result) + print(result2) + assert result == result2 + + +if __name__ == "__main__": + # cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。") + cuttest("我不喜欢日本和服。") + cuttest("雷猴回归人间。") + cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作") + cuttest("我需要廉租房") + cuttest("永和服装饰品有限公司") + cuttest("我爱北京天安门") + cuttest("abc") + cuttest("隐马尔可夫") + cuttest("雷猴是个好网站") + # cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成") + # '一词' / '一', '词' + cuttest("草泥马和欺实马是今年的流行词汇") + cuttest("伊藤洋华堂总府店") + cuttest("中国科学院计算技术研究所") + cuttest("罗密欧与朱丽叶") + cuttest("我购买了道具和服装") + cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍") + cuttest("湖北省石首市") + cuttest("湖北省十堰市") + cuttest("总经理完成了这件事情") + cuttest("电脑修好了") + cuttest("做好了这件事情就一了百了了") + cuttest("人们审美的观点是不同的") + cuttest("我们买了一个美的空调") + cuttest("线程初始化时我们要注意") + cuttest("一个分子是由好多原子组织成的") + cuttest("祝你马到功成") + cuttest("他掉进了无底洞里") + cuttest("中国的首都是北京") + cuttest("孙君意") + cuttest("外交部发言人马朝旭") + cuttest("领导人会议和第四届东亚峰会") + cuttest("在过去的这五年") + cuttest("还需要很长的路要走") + cuttest("60周年首都阅兵") + cuttest("你好人们审美的观点是不同的") + cuttest("买水果然后来世博园") + cuttest("买水果然后去世博园") + cuttest("但是后来我才知道你是对的") + cuttest("存在即合理") + cuttest("的的的的的在的的的的就以和和和") + cuttest("I love你,不以为耻,反以为rong") + cuttest("因") + cuttest("") + cuttest("hello你好人们审美的观点是不同的") + cuttest("很好但主要是基于网页形式") + cuttest("hello你好人们审美的观点是不同的") + cuttest("为什么我不能拥有想要的生活") + cuttest("后来我才") + cuttest("此次来中国是为了") + cuttest("使用了它就可以解决一些问题") + 
cuttest(",使用了它就可以解决一些问题") + cuttest("其实使用了它就可以解决一些问题") + cuttest("好人使用了它就可以解决一些问题") + cuttest("是因为和国家") + cuttest("老年搜索还支持") + cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ") + cuttest("大") + cuttest("") + cuttest("他说的确实在理") + cuttest("长春市长春节讲话") + cuttest("结婚的和尚未结婚的") + cuttest("结合成分子时") + cuttest("旅游和服务是最好的") + cuttest("这件事情的确是我的错") + cuttest("供大家参考指正") + cuttest("哈尔滨政府公布塌桥原因") + cuttest("我在机场入口处") + cuttest("邢永臣摄影报道") + cuttest("BP神经网络如何训练才能在分类时增加区分度?") + cuttest("南京市长江大桥") + cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究") + cuttest('长春市长春药店') + cuttest('邓颖超生前最喜欢的衣服') + cuttest('胡锦涛是热爱世界和平的政治局常委') + cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪') + cuttest('一次性交多少钱') + cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') + cuttest('小和尚留了一个像大和尚一样的和尚头') + cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') + # cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') + # cuttest('AT&T是一件不错的公司,给你发offer了吗?') + # '了', '吗' / '了吗' + # cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') + # '是', '吗' / '是吗' + # cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') + # '他开' / '他', '开' + # cuttest('枪杆子中出政权') + # '中' / '中出' + cuttest('张三风同学走上了不归路') + # cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。') + # '阿Q' /'阿', 'Q' + # 'BB机' / 'BB', '机' + # 'AA制' / 'AA', '制' + # cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。') + # '1号店' / '1', '号店' + # '小S', '和', '大S', / '小', 'S', '和', '大', 'S' + # '3D' / '3', 'D' + # jieba.del_word('很赞') + cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?') + # '5%' / '5', '%' \ No newline at end of file diff --git a/performace_test/speed.py b/performace_test/speed.py new file mode 100644 index 0000000..6c6bb64 --- /dev/null +++ b/performace_test/speed.py @@ -0,0 +1,43 @@ +#encoding=utf-8 +import sys +import os +import random +import datetime + +#wget https://raw.githubusercontent.com/yanyiwu/practice/master/nodejs/nodejieba/performance/weicheng.utf8 -O performace_test/weicheng.utf8 + +if __name__ == "__main__": + + if sys.argv[1] == "cppjieba_py":# 0:00:03.861202 + import 
cppjieba_py as jieba + import cppjieba_py.posseg as pseg # 0:00:11.860994 + elif sys.argv[1] == "jieba": # 0:01:24.703040 + import jieba + import jieba.posseg as pseg # 0:00:00.048153 + elif sys.argv[1] == "jieba_fast": + import jieba_fast as jieba + import jieba_fast.posseg as pseg + + if len(sys.argv) == 4 and sys.argv[3] =="pseg": + method = pseg.lcut + else: + method = jieba.lcut + HMM = False + if len(sys.argv) >= 3 and sys.argv[2] =="hmm": + HMM = True + lines = [] + weicheng = os.path.join(os.path.dirname(__file__),"weicheng.utf8") + for line in open(weicheng): + lines.append(line.strip()) + result = [""] * 10 + result[random.randint(0, 9)] = '/'.join(str(method("南京长江大桥",HMM=HMM))) + starttime = datetime.datetime.now() + + for i in range(50): + for line in lines: + r = '/'.join(str(method(line,HMM=HMM))) + # print(r) + result[random.randint(0, 9)] = r + #result[random.randint(0, 9)] = jieba.cut(line) + endtime = datetime.datetime.now() + print (endtime - starttime) \ No newline at end of file diff --git a/pybind11 b/pybind11 deleted file mode 160000 index a303c6f..0000000 --- a/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a303c6fc479662fd53eaa8990dbc65b7de9b7deb diff --git a/setup.py b/setup.py index 257eee5..cc2b9de 100644 --- a/setup.py +++ b/setup.py @@ -2,9 +2,14 @@ from setuptools.command.build_ext import build_ext import sys import setuptools +import os +import io +import subprocess +from os import path +from distutils.sysconfig import get_python_lib +site_package_dir = get_python_lib() + path.sep -__version__ = '0.0.1' - +__version__ = '0.0.13' class get_pybind_include(object): """Helper class to determine the pybind11 include path @@ -13,6 +18,11 @@ class get_pybind_include(object): method can be invoked. 
""" def __init__(self, user=False): + try: + import pybind11 + except ImportError: + if subprocess.call([sys.executable, '-m', 'pip', 'install', 'pybind11']): + raise RuntimeError('pybind11 install failed.') self.user = user def __str__(self): @@ -21,14 +31,17 @@ def __str__(self): ext_modules = [ Extension( - 'cppjieba_py', - ['src/main.cpp'], + 'libcppjieba', + # ['src/main.cpp'], + ["cppjieba_py/src/main.cpp"], include_dirs=[ # Path to pybind11 headers get_pybind_include(), get_pybind_include(user=True), - "cppjieba/include", - "cppjieba/deps" + # path.join(site_package_dir,"cppjieba",'include'), + # path.join(site_package_dir,"cppjieba",'deps') + "cppjieba_py/cppjieba/include", + "cppjieba_py/cppjieba/deps" ], language='c++' ), @@ -80,27 +93,56 @@ def build_extensions(self): if ct == 'unix': opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) + opts.append('-DSITE_PACKAGE_PATH="%s"' % + site_package_dir) opts.append(cpp_flag(self.compiler)) if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') - elif ct == 'msvc': + if ct == 'msvc': opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) + opts.append('/DSITE_PACKAGE_PATH=\\"%s\\"' % site_package_dir) for ext in self.extensions: ext.extra_compile_args = opts build_ext.build_extensions(self) +install_requires = ['pybind11>=2.2.0',"setuptools >= 0.7.0"] + +extras_require = { + 'test': ['spec>=1.4.1','nose>=1.3.7'] + } + +if sys.version_info[0] <3: + extras_require["test"].append("pathlib2") + +classifiers = [ + 'License :: OSI Approved :: MIT License', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: C++', + 'Operating System :: Unix', + 'Topic :: Text Processing :: Linguistic', + 'Topic :: Software 
Development :: Libraries :: Python Modules' +] setup( name='cppjieba_py', version=__version__, - author='yeping zheng', - author_email='fantasy614@gmail.com', - url='https://github.com/fantasy/cppjieba-py', - description='A python extension for cppjieba', - long_description='', + author='bung87,yeping zheng', + url='https://github.com/bung87/cppjieba-py/', + description='python bindings of cppjieba', + long_description= io.open("README.md",'r', encoding="utf-8").read(), + long_description_content_type='text/markdown', + classifiers = classifiers, ext_modules=ext_modules, - install_requires=['pybind11>=2.2'], + packages=['cppjieba_py'], + include_package_data=True, + install_requires=install_requires, + extras_require=extras_require, cmdclass={'build_ext': BuildExt}, zip_safe=False, ) diff --git a/src/main.cpp b/src/main.cpp deleted file mode 100644 index c256236..0000000 --- a/src/main.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include "cppjieba/Jieba.hpp" -#include - -namespace py = pybind11; - -const std::string DICT_PATH = "cppjieba/dict/jieba.dict.utf8"; -const std::string HMM_PATH = "cppjieba/dict/hmm_model.utf8"; -const std::string IDF_PATH = "cppjieba/dict/idf.utf8"; -const std::string STOP_WORD_PATH = "cppjieba/dict/stop_words.utf8"; - -struct JiebaCpp -{ - cppjieba::Jieba jieba; - - public: - JiebaCpp(const std::string &USER_DICT_PATH) : jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH){}; - - std::vector cut(std::string &text) - { - std::vector words; - jieba.Cut(text, words, true); - return words; - }; - - std::vector cutForSearch(std::string &text) - { - std::vector words; - jieba.CutForSearch(text, words, true); - return words; - }; - - void InsertUserWord(std::string &word) - { - jieba.InsertUserWord(word); - }; - -}; - -PYBIND11_MODULE(cppjieba_py, m) -{ - m.doc() = "python extension for cppjieba"; // optional module docstring - - py::class_(m, "jieba") - .def(py::init()) - .def("cut", &JiebaCpp::cut) - 
.def("cut_for_search", &JiebaCpp::cutForSearch) - .def("add_word", &JiebaCpp::InsertUserWord); -} \ No newline at end of file diff --git a/tests/test_jieba.py b/tests/test_jieba.py new file mode 100644 index 0000000..f6d1623 --- /dev/null +++ b/tests/test_jieba.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +from spec import Spec +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from pathlib2 import Path + +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +USER_DICT = str(DICT_DIR / "user.dict.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +import cppjieba_py as jieba + +class JiebaTest(Spec): + + class cut: + + def takes_arg1_as_sentence(self): + jieba.cut("") + + def takes_arg2_as_cut_all(self): + jieba.cut("", True) + + def takes_arg3_as_HMM(self): + jieba.cut("", True, True) + + def returns_iterator(self): + from collections import Iterable, Sequence + r = jieba.cut("", True, True) + iterable = isinstance(r, Iterable) + sequence = isinstance(r, Sequence) + assert iterable and not sequence + + class lcut: + def takes_arg1_as_sentence(self): + jieba.cut("") + + def takes_arg2_as_cut_all(self): + jieba.cut("", True) + + def takes_arg3_as_HMM(self): + jieba.cut("", True, True) + + def returns_list(self): + r = jieba.lcut("", True, True) + assert isinstance(r, list) + + class load_userdict: + def accept_string_as_arg(self): + jieba.load_userdict("") + + def accept_list_as_arg(self): + jieba.load_userdict([]) + + def accept_set_as_arg(self): + jieba.load_userdict(set([])) diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py new file mode 100644 index 0000000..b46cab4 --- /dev/null +++ b/tests/test_keyword_extractor.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +# pylint: disable=E1101 +from spec import Spec +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from 
pathlib2 import Path + +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +IDF = str(DICT_DIR / "idf.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +from cppjieba_py import Tokenizer +from cppjieba_py.analyse import KeywordExtractor + + +class KeywordExtractorrTest(Spec): + @classmethod + def setUpClass(cls): + cls.dt = Tokenizer(DICT) + cls.sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" + cls.extractor = KeywordExtractor(KeywordExtractorrTest.dt) + + class init: + "__init__" + + def takes_arg1_as_tokenizer(self): + pass + + def takes_arg2_as_IDF_PATH_and_arg3_as_STOP_WORD_PATH(self): + KeywordExtractor(self.dt, IDF, STOP_WORD) + + class extract_tags: + + def takes_arg1_as_sentence(self): + self.extractor.extract_tags(self.sentence) + + def takes_arg2_as_topK(self): + self.extractor.extract_tags(self.sentence, topK=5) + + def returns_list(self): + r = self.extractor.extract_tags(self.sentence, topK=5) + assert isinstance(r, list) diff --git a/tests/test_textrank_extractor.py b/tests/test_textrank_extractor.py new file mode 100644 index 0000000..e033ff8 --- /dev/null +++ b/tests/test_textrank_extractor.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +# pylint: disable=E1101 +from spec import Spec +import sys +if sys.version_info[0] >=3: + from pathlib import Path +else: + from pathlib2 import Path + +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +IDF = str(DICT_DIR / "idf.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +from cppjieba_py import Tokenizer +from cppjieba_py.analyse import TextRankExtractor + + +class TextRankExtractorTest(Spec): + @classmethod + def setUpClass(cls): + cls.dt = Tokenizer(DICT) + cls.sentence = 
"此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。" + cls.extractor = TextRankExtractor(cls.dt) + + class init: + "__init__" + + def takes_arg1_as_tokenizer(self): + pass + + def takes_arg2_as_STOP_WORD_PATH(self): + TextRankExtractor(self.dt, STOP_WORD) + + class textrank_no_weight: + + def takes_arg1_as_sentence(self): + self.extractor.textrank_no_weight(self.sentence) + + def takes_arg2_as_topK(self): + self.extractor.textrank_no_weight(self.sentence, topK=5) + + def returns_list(self): + r = self.extractor.textrank_no_weight(self.sentence, topK=5) + assert isinstance(r, list) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py new file mode 100644 index 0000000..0c68311 --- /dev/null +++ b/tests/test_tokenizer.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +# pylint: disable=E1101 +from spec import Spec +import sys +if sys.version_info[0] >= 3: + from pathlib import Path +else: + from pathlib2 import Path + +DICT_DIR = Path("../cppjieba_py/cppjieba/dict") +DICT = str(DICT_DIR / "jieba.dict.utf8") +USER_DICT = str(DICT_DIR / "user.dict.utf8") +STOP_WORD = str(DICT_DIR / "stop_words.utf8") + +from cppjieba_py import Tokenizer + + +class TokenizerTest(Spec): + @classmethod + def setUpClass(cls): + cls.dt = Tokenizer(DICT) + cls.dt.add_word("区块链", 10, "nz") + + class init_0: + "__init__" + + def takes_arg1_as_main_dict_path(self): + pass + + def takes_arg2_as_user_dict_path(self): + Tokenizer(DICT, USER_DICT) + + def takes_arg3_as_stopword_path(self): + Tokenizer(DICT, USER_DICT, STOP_WORD) + + class cut: + + def takes_arg1_as_sentence(self): + self.dt.cut("") + + def takes_arg2_as_cut_all(self): + self.dt.cut("", True) + + def takes_arg3_as_HMM(self): + self.dt.cut("", True, True) + + def returns_iterator(self): + from collections import Iterable, Sequence + r = self.dt.cut("", True, True) + iterable = isinstance(r, Iterable) + 
sequence = isinstance(r, Sequence) + assert iterable and not sequence + + class lcut: + def takes_arg1_as_sentence(self): + self.dt.cut("") + + def takes_arg2_as_cut_all(self): + self.dt.cut("", True) + + def takes_arg3_as_HMM(self): + self.dt.cut("", True, True) + + def returns_list(self): + r = self.dt.lcut("", True, True) + assert isinstance(r, list) + + class load_userdict: + def accept_string_as_arg(self): + self.dt.load_userdict("") + + def accept_list_as_arg(self): + self.dt.load_userdict([]) + + def accept_set_as_arg(self): + self.dt.load_userdict(set([])) + + class add_word: + def takes_arg1_as_word(self): + self.dt.add_word("区块链") + + def takes_arg2_as_freq(self): + self.dt.add_word("区块链", 10) + + def takes_arg3_as_tag(self): + pass + + class find: + def takes_arg1_as_word(self): + self.dt.find("区块链") + + def can_find_added_word(self): + r = self.dt.find("区块链") + assert r == True + + class lookup_tag: + def takes_arg1_as_word(self): + self.dt.lookup_tag("区块链") + + def can_find_added_word(self): + self.dt.add_word("区块链", 10, "nz") # because of random test order + # from nose.plugins.skip import Skip + r = self.dt.lookup_tag("区块链") + # try: + assert r == "nz" + # except AssertionError: + # raise Skip() diff --git a/travis/build-wheels.sh b/travis/build-wheels.sh new file mode 100755 index 0000000..a5477a1 --- /dev/null +++ b/travis/build-wheels.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e -x + +# Install a system package required by our library +# yum install -y atlas-devel + +# Compile wheels +for PYBIN in /opt/python/*/bin; do + # "${PYBIN}/pip" install -r /io/dev-requirements.txt + "${PYBIN}/pip" wheel /io/ -w wheelhouse/ +done + +# Bundle external shared libraries into the wheels +for whl in wheelhouse/*.whl; do + auditwheel repair "$whl" --plat $PLAT -w /io/wheelhouse/ +done + +# Install packages and test +for PYBIN in /opt/python/*/bin/; do + "${PYBIN}/pip" install cppjieba-py --no-index -f /io/wheelhouse + # (cd "$HOME"; "${PYBIN}/nosetests" 
pymanylinuxdemo) +done \ No newline at end of file