fantasy · pc123s · Jun 5, 2018 · Jun 5, 2018 · Jun 5, 2018 · Jun 6, 2018
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+performace_test/*
 # Byte-compiled / optimized / DLL files
 .DS_Store
 __pycache__/

diff --git a/.gitmodules b/.gitmodules
@@ -1,6 +1,10 @@
-[submodule "pybind11"]
-	path = pybind11
-	url = https://github.com/pybind/pybind11.git
-[submodule "cppjieba"]
-	path = cppjieba
+# [submodule "pybind11"]
+# 	path = pybind11
+# 	active = false
+# 	url = https://github.com/pybind/pybind11.git
+# [submodule "cppjieba"]
+# 	path = cppjieba
+# 	url = https://github.com/yanyiwu/cppjieba.git
+[submodule "cppjieba_py/cppjieba"]
+	path = cppjieba_py/cppjieba
 	url = https://github.com/yanyiwu/cppjieba.git
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,54 @@
+notifications:
+  email: false
+
+# matrix:
+#   include:
+#     - sudo: required
+#       services:
+#         - docker
+#       env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64
+#            PLAT=manylinux1_x86_64
+#     - sudo: required
+#       services:
+#         - docker
+#       env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686
+#            PRE_CMD=linux32
+#            PLAT=manylinux1_i686
+#     - sudo: required
+#       services:
+#         - docker
+#       env: DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64
+#            PLAT=manylinux2010_x86_64
+
+language: python
+python:
+- '2.7'
+- '3.4'
+- '3.5'
+- '3.6'
+sudo: false
+git:
+  submodules: false
+before_install:
+- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+- sudo apt-get -y update
+- sudo apt-get -y install build-essential
+- sudo apt-get -y install g++-5
+- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1
+- g++ --version
+- git submodule update --init cppjieba_py/cppjieba
+install:
+- pip install ".[test]"
+script:
+- nosetests -c nose.cfg
+# before_deploy:
+# - docker run --rm -e PLAT=$PLAT -v "$(pwd)":/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh
+deploy:
+  # on:
+  #   tags: true
+  provider: pypi
+  distributions: sdist 
+  user: __token__
+  password:
+    secure: fzIztIC2P0xWqEUHSYelFcnbC7aRV6OCgTtLExafvWRLl6W555X3iHUXI6KsVN7YNZOedXl0TqhFrlbCXky/Ub9ReoV6/htd3xsubQUXWWALwA3bynAH46AQIe21UH6dhC62LNMbjHVnrEAC4w9GVlsigqIYESavcXCypWwk+bziiYpTFpkTxnSAFeelL2PzOEzvjlMvIu7lN15+ODuk/HmAPKO2FTTqsr2B8xkYQpC09vK482hSVblFJqokSPQmxhRMgZ+Q03zNQNnsvgZY2J8KYqsRSH/A1JRwxNorefsM5yTfY1sORgDD1MpwcxPhF5FGLJTzNJ6jXBW5l/uUUjLXaEa4ohNA9xWoQ+QKwGkCjPA4N3F9zaBTyS7vK07pBxPXN2RyBfmUS4DkpbEyGj/29lq8Ixe3q5LneAItlzhLzhSxtcQmqTA1dyutetbn9kxg/u6J7TYCw4UlCCoOKKKBln9kifoa7cNwJoPHfXqnY1roqppSmvgIwHJf8wdtJlfGYSW6jNEUKQPgn41kEK6shO5ue3Sxe+qk0kM0d7DUpa4ZI+6vGIx6A187Xj2x5NlLii6zunJweEK/ifxOOgdwTtVl/kgPr89Nzely1Qipx15D4p2q4r1A7Mk//LXx3gWDcVGjQA0pB49q+kASonaD0eSD184Im+I94YhlQSI=
+  skip_existing: true
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,6 @@
+include README.md
+include setup.py
+recursive-include cppjieba_py/src *.cpp
+recursive-include cppjieba_py/cppjieba/include/cppjieba *.hpp
+recursive-include cppjieba_py/cppjieba/dict *.utf8
+recursive-include cppjieba_py/cppjieba/deps  *.hpp
diff --git a/README.md b/README.md
@@ -1,15 +1,45 @@
-# cppjieba-py
+# cppjieba-py   
 
-cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。
+[![Build Status](https://travis-ci.org/bung87/cppjieba-py.svg?branch=master)](https://travis-ci.org/bung87/cppjieba-py) [![PyPI](https://img.shields.io/pypi/v/cppjieba-py.svg)](https://pypi.python.org/pypi/cppjieba-py)
 
-## 性能
+cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。 
 
-测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)：先按行读取文本围城到一个数组里，然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了，容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。
+由于只是对cppjieba的接口进行的封装，所以执行效率上非常接近于原cppjieba。  
 
-| 方案        | 速度             |
-| ------------- |:-------------:|
-| cppjieba-py      | 8s  |
-| jieba      | 77s    |
+项目主要分为两个部分：**libcppjieba** 为 cppjieba 的 python extension，  
+**cppjieba_py** 为使开发者平滑过渡到使用cppjieba-py而作的 python package。 
+具体见[example.py](example.py)。  
+
+### 区别  
+
+* 原jieba的`.cut`族接口基本都由python的`iter()`函数包裹list结果来实现。  
+* 原jieba的`.set_*`方法基本都由class传入初始化的参数来完成。   
+* `.del_word` 和`.suggest_freq` cppjieba没提供。  
+* `POSTokenizer.lcut` 在`Tokenizer.tag` 下， 唯一一个只提供了list返回类型的接口。
+
+## 安装  
+
+* pypi  
+
+	```pip install cppjieba-py```  
+
+	如果你设置的安装源并未收录本项目，可以指定官方源：  
+
+	```pip install -i https://pypi.org/simple/ cppjieba-py```  
+
+* 从发行包安装  
+	see [releases](https://github.com/bung87/cppjieba-py/releases)  
+
+	```pip install https://github.com/bung87/cppjieba-py/files/<xxxxxxx>/cppjieba_py-<x.x.x>.tar.gz```  
+
+* 从源代码安装
+
+	```
+	$ git clone --recursive https://github.com/bung87/cppjieba-py
+	$ pip install . # or 
+	$ python setup.py install --old-and-unmanageable 
+	# Without the --old-and-unmanageable argument the package is installed under an egg directory, which causes libcppjieba to look for the default dictionaries in the wrong directory.
+	```
 
 
 ## 使用
@@ -18,10 +48,10 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装
 
 ```python
 # -*- coding: utf-8 -*-
-from cppjieba_py import jieba 
-
-jieba_instance = jieba("cppjieba/dict/user.dict.utf8")
-seg_list = jieba_instance.cut("我来到北京清华大学")
+import cppjieba_py as jieba
+# or use the default Tokenizer: jieba.cut
+# Tokenizer is reached through the module alias; a bare name would be undefined here.
+jieba_instance = jieba.Tokenizer()
+seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True)
 print("Full Mode: " + "/ ".join(seg_list))  # 全模式
 
 
@@ -34,12 +64,38 @@ print(", ".join(seg_list))
 
 ```
 
-## 安装
+For more examples, see [example.py](example.py) and [tests](tests).
 
-* 从源代码安装
+## 性能  
 
-	```
-	$ git clone --recursive https://github.com/fantasy/cppjieba-py
-	$ python setup.py build 
-	$ python setup.py install 
-	```
+[performace_test/speed.py](performace_test/speed.py)  
+
+测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)：先按行读取文本围城到一个数组里，然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了，容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。
+
+`lcut HMM=False`  
+
+| 方案        | 速度             |
+| ------------- |:-------------:|
+| cppjieba-py      | 10.642102  |
+| jieba-fast==0.51      | 26.129298  |
+| jieba==0.39      | 50.623866    |
+
+`lcut HMM=True`  
+
+| 方案        | 速度             |
+| ------------- |:-------------:|
+| cppjieba-py      | 13.139232  |
+| jieba-fast==0.51      | 34.574907  |
+| jieba==0.39      | 1:26.756226    |
+
+`posseg.lcut`   
+
+| 方案        | 速度             |
+| ------------- |:-------------:|
+| cppjieba-py      | 20.382905  |
+| jieba==0.39    | 1:19.411649    |
+
+## Test  
+
+`pip install ".[test]"`  
+`nosetests -c nose.cfg`
diff --git a/cppjieba b/cppjieba
diff --git a/cppjieba_py/__init__.py b/cppjieba_py/__init__.py
@@ -0,0 +1,40 @@
+
+import libcppjieba
+from libcppjieba import Tokenizer,add_word,tokenize,load_userdict,find,lookup_tag
+from libcppjieba import lcut,lcut_for_search,initialize
+from libcppjieba import cut_all as _cut_all,lcut_all
+
+def _iter_wraps_doc(origin):
+    """Build a docstring for an iterator wrapper around ``origin``.
+
+    The first occurrence of ``origin.__name__`` inside ``origin.__doc__`` is
+    replaced with ``"Iterator wraps <name>"``.
+
+    Args:
+        origin: Callable whose ``__doc__`` and ``__name__`` are read.
+
+    Returns:
+        The rewritten docstring, or ``None`` when ``origin`` has no docstring.
+    """
+    doc = origin.__doc__
+    if doc is None:
+        # Nothing to rewrite; calling ``.replace`` on None would raise.
+        return None
+    name = origin.__name__
+    return doc.replace(name, "Iterator wraps %s" % name, 1)
+
+def cut(*args, **kwargs):
+    # Forward everything to the extension and hand back an iterator over
+    # its result. The docstring is assigned below from the extension's own.
+    return iter(libcppjieba.cut(*args, **kwargs))
+
+def cut_all(*args, **kwargs):
+    # Same wrapping as ``cut``, around the extension's ``cut_all``
+    # (imported above as ``_cut_all``).
+    return iter(_cut_all(*args, **kwargs))
+
+# Derive the wrappers' docstrings from the wrapped extension functions.
+# ``cut_all`` gets one too, so all iterator wrappers are documented alike.
+cut.__doc__ = _iter_wraps_doc(libcppjieba.cut)
+cut_all.__doc__ = _iter_wraps_doc(_cut_all)
+
+def cut_for_search(*args, **kwargs):
+    # Forward all arguments to the extension's ``cut_for_search`` and
+    # return an iterator over whatever it produces.
+    return iter(libcppjieba.cut_for_search(*args, **kwargs))
+
+# The docstring is derived from the wrapped extension function.
+cut_for_search.__doc__ = _iter_wraps_doc(libcppjieba.cut_for_search)
+
+def _cut(ins, *args, **kwargs):
+    # Calls ``ins.cut_internal`` with the remaining arguments and returns
+    # an iterator over its result. Installed later in this module as
+    # ``Tokenizer.cut``, where ``ins`` is the instance.
+    return iter(ins.cut_internal(*args, **kwargs))
+
+def _cut_for_search(ins, *args, **kwargs):
+    # Calls ``ins.cut_for_search_internal`` with the remaining arguments
+    # and returns an iterator over its result. Installed later in this
+    # module as ``Tokenizer.cut_for_search``.
+    return iter(ins.cut_for_search_internal(*args, **kwargs))
+
+# Derive the wrappers' docstrings from the Tokenizer methods they call.
+_cut.__doc__ = _iter_wraps_doc(Tokenizer.cut_internal)
+_cut_for_search.__doc__ = _iter_wraps_doc(Tokenizer.cut_for_search_internal)
+
+# Attach the iterator-returning wrappers to the extension class as
+# ``Tokenizer.cut`` and ``Tokenizer.cut_for_search``.
+Tokenizer.cut = _cut
+Tokenizer.cut_for_search = _cut_for_search
+
diff --git a/cppjieba_py/analyse.py b/cppjieba_py/analyse.py
@@ -0,0 +1,25 @@
+# pylint: disable=E0611
+from libcppjieba import get_default_keyword_extractor ,\
+    get_default_textrank_extractor
+
+from libcppjieba import KeywordExtractor ,\
+      TextRankExtractor 
+# pylint: enable=E0611
+
+# Short aliases for the extension's extractor classes.
+TFIDF = KeywordExtractor
+TextRank = TextRankExtractor
+
+def _textrank(self, sentence, topK=20, withWeight=False):
+    """Dispatch to the weighted or unweighted TextRank method of ``self``.
+
+    Args:
+        sentence: Passed through unchanged to the selected method.
+        topK: Passed through unchanged as the second argument.
+        withWeight: When truthy, ``self.textrank_with_weight`` is called;
+            otherwise ``self.textrank_no_weight`` is called.
+
+    Returns:
+        Whatever the selected method returns.
+    """
+    if withWeight:
+        return self.textrank_with_weight(sentence, topK)
+    return self.textrank_no_weight(sentence, topK)
+
+# Install the dispatcher above as the ``textrank`` attribute of TextRank.
+TextRank.textrank = _textrank
+
+# Default extractors obtained from the extension when this module is imported.
+keywordExtractor = get_default_keyword_extractor()
+textrankExtractor = get_default_textrank_extractor()
+
+# Module-level shortcuts to attributes of the default extractors.
+extract_tags = keywordExtractor.extract_tags
+textrank = textrankExtractor.textrank
+
diff --git a/cppjieba_py/cppjieba b/cppjieba_py/cppjieba
diff --git a/cppjieba_py/posseg.py b/cppjieba_py/posseg.py
@@ -0,0 +1,9 @@
+
+import libcppjieba
+
+def cut(sentence, HMM=False):
+    """Return an iterator over the result of ``libcppjieba.tag(sentence)``.
+
+    Args:
+        sentence: Passed unchanged to ``libcppjieba.tag``.
+        HMM: Accepted but not used by this function; it is not forwarded
+            to ``libcppjieba.tag``.
+
+    Returns:
+        An iterator over whatever ``libcppjieba.tag`` returns.
+    """
+    return iter(libcppjieba.tag(sentence))
+
+def lcut(sentence, HMM=False):
+    """Return the result of ``libcppjieba.tag(sentence)`` unchanged.
+
+    Args:
+        sentence: Passed unchanged to ``libcppjieba.tag``.
+        HMM: Accepted but not used by this function.
+
+    Returns:
+        Whatever ``libcppjieba.tag`` returns.
+    """
+    tagged = libcppjieba.tag(sentence)
+    return tagged