Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
e085e9f
wrap core apis
bung87 Jun 5, 2018
002716a
cut* and lcut*
bung87 Jun 5, 2018
fe0f959
posseg
bung87 Jun 5, 2018
ccb8a57
python yield from vector<Word> significant slow than vector<string>,s…
bung87 Jun 6, 2018
e688f94
add .initialize
bung87 Jun 6, 2018
f15d9a0
renames files
bung87 Jun 6, 2018
0a01f1f
update readme
bung87 Jun 6, 2018
25ed3ff
update version
bung87 Jun 6, 2018
92bde24
update readme
bung87 Jun 6, 2018
2f5ee88
compare to jieba_fast
bung87 Jun 6, 2018
ec5eb8d
posseg
bung87 Jun 6, 2018
205a137
analyse.extract_tags
bung87 Jun 6, 2018
05c5820
export .cut_all
bung87 Jun 6, 2018
d36a195
analyse.textrank
bung87 Jun 6, 2018
74854b3
typo
bung87 Jun 6, 2018
449d47b
v0.0.5
bung87 Jun 6, 2018
cb2d7e6
o
bung87 Jun 6, 2018
ce2984e
.add_word ,tokenize ,and clean names
bung87 Jun 7, 2018
1112295
update version,doc
bung87 Jun 7, 2018
1bc799a
update readme
bung87 Jun 7, 2018
83bd055
change cppjieba to mine fork version in case needs modify the cpp files
bung87 Jun 8, 2018
c169c54
add .load_userdict
bung87 Jun 8, 2018
f30e32f
fix while install locally
bung87 Jun 8, 2018
811113c
o
bung87 Jun 8, 2018
74dbccf
fix dict runtime relative path
bung87 Jun 8, 2018
6de04ea
add test and travis file
bung87 Jun 8, 2018
263cb84
o
bung87 Jun 8, 2018
cea0079
o
bung87 Jun 8, 2018
981a9f7
o
bung87 Jun 9, 2018
8e420ed
o
bung87 Jun 9, 2018
655ff3e
o
bung87 Jun 9, 2018
99e20e6
o
bung87 Jun 9, 2018
a2b139c
classifiers info [skip ci]
bung87 Jun 9, 2018
b735b07
v0.0.8
bung87 Jun 9, 2018
1a74c87
update classifiers
bung87 Jun 9, 2018
a78e91d
fix cut_all condition
bung87 Jun 10, 2018
ed326b6
fix cut_all condition
bung87 Jun 10, 2018
5718476
o
bung87 Jun 11, 2018
3e3727d
add posseg.cut speed test,jieba wins
bung87 Aug 18, 2018
3902492
posseg api compatibility HMM will be ignored in fact
bung87 Aug 18, 2018
3095fc2
release 0.0.10
bung87 Aug 18, 2018
e89aa3b
use lcut actually perform cut for speed test
bung87 Aug 18, 2018
454a93d
improve performance comparation
bung87 Aug 18, 2018
148db6b
o
bung87 Aug 18, 2018
f506c2f
o
bung87 Aug 18, 2018
0741718
update consistency test commented code will fails whether using hmm
bung87 Aug 19, 2018
6d4bf8a
open readme as utf8 in case user platform default encoding is not utf8
bung87 Mar 16, 2019
0428594
open readme as utf8 in case user platform default encoding is not utf8
bung87 Mar 16, 2019
22bd465
readme travis pypi
bung87 Nov 4, 2019
50843a4
pybind11==2.2
bung87 Nov 4, 2019
b1d2a5a
o
bung87 Nov 4, 2019
775d940
ignore submodules
bung87 Nov 4, 2019
70119a8
o
bung87 Nov 4, 2019
3d1a553
o
bung87 Nov 4, 2019
b821939
o
bung87 Nov 4, 2019
e3a89f1
o
bung87 Nov 4, 2019
dc7bfe5
v0.0.13
bung87 Nov 5, 2019
bc0e457
o
bung87 Nov 5, 2019
fe7a60f
o
bung87 Nov 6, 2019
19d34d9
o
bung87 Nov 6, 2019
f4f5a15
o
bung87 Nov 6, 2019
9dd5b86
skip bdist_wheel as pypi not allow
bung87 Nov 6, 2019
95772e3
o
bung87 Nov 6, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
performace_test/*
# Byte-compiled / optimized / DLL files
.DS_Store
__pycache__/
Expand Down
14 changes: 9 additions & 5 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
[submodule "pybind11"]
path = pybind11
url = https://github.com/pybind/pybind11.git
[submodule "cppjieba"]
path = cppjieba
# [submodule "pybind11"]
# path = pybind11
# active = false
# url = https://github.com/pybind/pybind11.git
# [submodule "cppjieba"]
# path = cppjieba
# url = https://github.com/yanyiwu/cppjieba.git
[submodule "cppjieba_py/cppjieba"]
path = cppjieba_py/cppjieba
url = https://github.com/yanyiwu/cppjieba.git
54 changes: 54 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
notifications:
email: false

# matrix:
# include:
# - sudo: required
# services:
# - docker
# env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64
# PLAT=manylinux1_x86_64
# - sudo: required
# services:
# - docker
# env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686
# PRE_CMD=linux32
# PLAT=manylinux1_i686
# - sudo: required
# services:
# - docker
# env: DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64
# PLAT=manylinux2010_x86_64

language: python
python:
- '2.7'
- '3.4'
- '3.5'
- '3.6'
sudo: false
git:
submodules: false
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get -y update
- sudo apt-get -y install build-essential
- sudo apt-get -y install g++-5
- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 1
- g++ --version
- git submodule update --init cppjieba_py/cppjieba
install:
- pip install ".[test]"
script:
- nosetests -c nose.cfg
# before_deploy:
# - docker run --rm -e PLAT=$PLAT -v "$(pwd)":/io $DOCKER_IMAGE $PRE_CMD /io/travis/build-wheels.sh
deploy:
# on:
# tags: true
provider: pypi
distributions: sdist
user: __token__
password:
secure: fzIztIC2P0xWqEUHSYelFcnbC7aRV6OCgTtLExafvWRLl6W555X3iHUXI6KsVN7YNZOedXl0TqhFrlbCXky/Ub9ReoV6/htd3xsubQUXWWALwA3bynAH46AQIe21UH6dhC62LNMbjHVnrEAC4w9GVlsigqIYESavcXCypWwk+bziiYpTFpkTxnSAFeelL2PzOEzvjlMvIu7lN15+ODuk/HmAPKO2FTTqsr2B8xkYQpC09vK482hSVblFJqokSPQmxhRMgZ+Q03zNQNnsvgZY2J8KYqsRSH/A1JRwxNorefsM5yTfY1sORgDD1MpwcxPhF5FGLJTzNJ6jXBW5l/uUUjLXaEa4ohNA9xWoQ+QKwGkCjPA4N3F9zaBTyS7vK07pBxPXN2RyBfmUS4DkpbEyGj/29lq8Ixe3q5LneAItlzhLzhSxtcQmqTA1dyutetbn9kxg/u6J7TYCw4UlCCoOKKKBln9kifoa7cNwJoPHfXqnY1roqppSmvgIwHJf8wdtJlfGYSW6jNEUKQPgn41kEK6shO5ue3Sxe+qk0kM0d7DUpa4ZI+6vGIx6A187Xj2x5NlLii6zunJweEK/ifxOOgdwTtVl/kgPr89Nzely1Qipx15D4p2q4r1A7Mk//LXx3gWDcVGjQA0pB49q+kASonaD0eSD184Im+I94YhlQSI=
skip_existing: true
6 changes: 6 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include README.md
include setup.py
recursive-include cppjieba_py/src *.cpp
recursive-include cppjieba_py/cppjieba/include/cppjieba *.hpp
recursive-include cppjieba_py/cppjieba/dict *.utf8
recursive-include cppjieba_py/cppjieba/deps *.hpp
94 changes: 75 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,45 @@
# cppjieba-py
# cppjieba-py

cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。
[![Build Status](https://travis-ci.org/bung87/cppjieba-py.svg?branch=master)](https://travis-ci.org/bung87/cppjieba-py) [![PyPI](https://img.shields.io/pypi/v/cppjieba-py.svg)](https://pypi.python.org/pypi/cppjieba-py)

## 性能
cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装。

测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。
由于只是对cppjieba的接口进行的封装,所以执行效率上非常接近于原cppjieba。

| 方案 | 速度 |
| ------------- |:-------------:|
| cppjieba-py | 8s |
| jieba | 77s |
项目主要分为两个部分**libcppjieba** 为 cppjieba 的 python extension,
**cppjieba_py** 为使开发者平滑过渡到使用cppjieba-py而作的 python package。
具体见[example.py](example.py)。

### 区别

* 原jieba的`.cut`族接口基本都由python的`iter()`函数包裹list结果来实现。
* 原jieba的`.set_*`方法基本都由class传入初始化的参数来完成。
* `.del_word` 和`.suggest_freq` cppjieba没提供。
* `POSTokenizer.lcut` 在`Tokenizer.tag` 下, 唯一一个只提供了list返回类型的接口。

## 安装

* pypi

```pip install cppjieba-py```

或者你设置的安装源并未收录本项目

```pip install -i https://pypi.org/simple/ cppjieba-py```

* 从发行包安装
see [releases](https://github.com/bung87/cppjieba-py/releases)

```pip install https://github.com/bung87/cppjieba-py/files/<xxxxxxx>/cppjieba_py-<x.x.x>.tar.gz```

* 从源代码安装

```
$ git clone --recursive https://github.com/bung87/cppjieba-py
$ pip install . # or
$ python setup.py install --old-and-unmanageable
# Without this argument the package is installed under an egg directory, which causes libcppjieba to look for its default dictionaries in the wrong directory.
```


## 使用
Expand All @@ -18,10 +48,10 @@ cppjieba-py 是 [cppjieba](https://github.com/yanyiwu/cppjieba)的 Python 封装

```python
# -*- coding: utf-8 -*-
from cppjieba_py import jieba

jieba_instance = jieba("cppjieba/dict/user.dict.utf8")
seg_list = jieba_instance.cut("我来到北京清华大学")
import cppjieba_py as jieba
# or use the default Tokenizer: jieba.cut
jieba_instance = jieba.Tokenizer()
seg_list = jieba_instance.cut("我来到北京清华大学",cut_all = True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式


Expand All @@ -34,12 +64,38 @@ print(", ".join(seg_list))

```

## 安装
for more: [example.py](example.py) , [tests](tests)

* 从源代码安装
## 性能

```
$ git clone --recursive https://github.com/fantasy/cppjieba-py
$ python setup.py build
$ python setup.py install
```
[performace_test/speed.py](performace_test/speed.py)

测试[方案](https://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html):先按行读取文本围城到一个数组里,然后循环对围城每行文字作为一个句子进行分词。因为只对围城这本书分词一遍太快了,容易误差。 所以循环对围城这本书分词50次。基本上每次分词耗时都很稳定。 分词算法都是采用【精确模式】。

`lcut HMM=False`

| 方案 | 速度 |
| ------------- |:-------------:|
| cppjieba-py | 10.642102 |
| jieba-fast==0.51 | 26.129298 |
| jieba==0.39 | 50.623866 |

`lcut HMM=True`

| 方案 | 速度 |
| ------------- |:-------------:|
| cppjieba-py | 13.139232 |
| jieba-fast==0.51 | 34.574907 |
| jieba==0.39 | 1:26.756226 |

`posseg.lcut`

| 方案 | 速度 |
| ------------- |:-------------:|
| cppjieba-py | 20.382905 |
| jieba==0.39 | 1:19.411649 |

## Test

`pip install ".[test]"`
`nosetests -c nose.cfg`
1 change: 0 additions & 1 deletion cppjieba
Submodule cppjieba deleted from 6aff1f
40 changes: 40 additions & 0 deletions cppjieba_py/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

import libcppjieba
from libcppjieba import Tokenizer,add_word,tokenize,load_userdict,find,lookup_tag
from libcppjieba import lcut,lcut_for_search,initialize
from libcppjieba import cut_all as _cut_all,lcut_all

def _iter_wraps_doc(origin):
    """Build the docstring for an iterator wrapper around ``origin``.

    The first occurrence of ``origin.__name__`` inside ``origin.__doc__`` is
    prefixed with ``"Iterator wraps "``; the rest of the text is kept as is.

    Args:
        origin: The wrapped callable. Only its ``__name__`` and ``__doc__``
            attributes are read.

    Returns:
        The rewritten docstring. If ``origin`` has no docstring, the minimal
        text ``"Iterator wraps <name>"`` is returned.
    """
    label = "Iterator wraps %s" % origin.__name__
    doc = origin.__doc__
    if doc is None:
        # This helper is called at module level, so raising AttributeError
        # here would make the whole package unimportable.
        return label
    return doc.replace(origin.__name__, label, 1)

def cut(*args, **kvargs):
    # Forwards every argument to libcppjieba.cut and returns an iterator
    # over its result. Documented with a comment because cut.__doc__ is
    # assigned further down in this module.
    return iter(libcppjieba.cut(*args, **kvargs))

def cut_all(*args, **kvargs):
    # Forwards every argument to libcppjieba's cut_all (imported here as
    # _cut_all) and returns an iterator over its result.
    return iter(_cut_all(*args, **kvargs))


# Give the module-level cut wrapper the docstring of the extension function
# it wraps, with its name prefixed by "Iterator wraps".
cut.__doc__ = _iter_wraps_doc(libcppjieba.cut)

def cut_for_search(*args, **kvargs):
    # Forwards every argument to libcppjieba.cut_for_search and returns an
    # iterator over its result. The docstring is attached right below.
    return iter(libcppjieba.cut_for_search(*args, **kvargs))


# Derive the wrapper's docstring from the extension function it wraps.
cut_for_search.__doc__ = _iter_wraps_doc(libcppjieba.cut_for_search)

def _cut(ins, *args, **kvargs):
    # Method-style wrapper: ``ins`` is the object whose cut_internal is
    # called with the remaining arguments; the result is returned as an
    # iterator. The docstring is assigned later in this module.
    return iter(ins.cut_internal(*args, **kvargs))

def _cut_for_search(ins, *args, **kvargs):
    # Method-style wrapper: calls ins.cut_for_search_internal with the
    # remaining arguments and returns an iterator over the result. The
    # docstring is assigned later in this module.
    return iter(ins.cut_for_search_internal(*args, **kvargs))

# The Python-level wrappers take their docstrings from the extension methods
# they wrap (name prefixed with "Iterator wraps").
_cut.__doc__ = _iter_wraps_doc(Tokenizer.cut_internal)
_cut_for_search.__doc__ = _iter_wraps_doc(Tokenizer.cut_for_search_internal)

# Install the iterator-returning wrappers as Tokenizer.cut and
# Tokenizer.cut_for_search.
Tokenizer.cut = _cut
Tokenizer.cut_for_search = _cut_for_search

25 changes: 25 additions & 0 deletions cppjieba_py/analyse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# pylint: disable=E0611
from libcppjieba import get_default_keyword_extractor ,\
get_default_textrank_extractor

from libcppjieba import KeywordExtractor ,\
TextRankExtractor
# pylint: enable=E0611

# Shorter aliases for the extractor classes imported from libcppjieba.
# NOTE(review): presumably chosen to match the TextRank / TFIDF class names
# of jieba.analyse -- confirm.
TextRank = TextRankExtractor
TFIDF = KeywordExtractor

def _textrank(self, sentence, topK=20, withWeight=False):
    """Dispatch to the weighted or unweighted TextRank method of ``self``.

    Args:
        self: Object providing ``textrank_with_weight`` and
            ``textrank_no_weight``.
        sentence: Text passed through to the selected method.
        topK: Second argument passed through to the selected method.
        withWeight: When truthy, ``textrank_with_weight`` is used;
            otherwise ``textrank_no_weight``.

    Returns:
        Whatever the selected method returns.
    """
    if withWeight:
        return self.textrank_with_weight(sentence, topK)
    return self.textrank_no_weight(sentence, topK)

# Install the weight-dispatching helper as the TextRank.textrank method.
TextRank.textrank = _textrank

# Module-wide extractor instances obtained from the extension.
keywordExtractor = get_default_keyword_extractor()
textrankExtractor = get_default_textrank_extractor()

# Module-level shortcuts bound to those default instances.
extract_tags = keywordExtractor.extract_tags
textrank = textrankExtractor.textrank

1 change: 1 addition & 0 deletions cppjieba_py/cppjieba
Submodule cppjieba added at 79ffd0
9 changes: 9 additions & 0 deletions cppjieba_py/posseg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

import libcppjieba

def cut(sentence, HMM=False):
    """Return an iterator over the result of ``libcppjieba.tag(sentence)``.

    ``HMM`` is accepted in the signature but is not used by this function.
    """
    return iter(libcppjieba.tag(sentence))

def lcut(sentence, HMM=False):
    """Return the result of ``libcppjieba.tag(sentence)`` unchanged.

    ``HMM`` is accepted in the signature but is not used by this function.
    """
    tagged = libcppjieba.tag(sentence)
    return tagged
Loading