Skip to content

Commit

Permalink
Merge pull request #16 from SeanLee97/develop
Browse files Browse the repository at this point in the history
[0.2.0] 1. 代码规范化 2.并行分词/词性标注 3.bumpversion版本管理
  • Loading branch information
SeanLee97 authored Apr 26, 2019
2 parents ad2d3c0 + 011eeb3 commit 25e7c1b
Show file tree
Hide file tree
Showing 30 changed files with 716 additions and 1,007 deletions.
6 changes: 6 additions & 0 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[bumpversion]
files = setup.py xmnlp/__init__.py
current_version = 0.2.0
commit = True
tag = True

45 changes: 43 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
<p align='center'>/ xmnlp /</p>
<p align='center'>小明NLP — 轻量级中文自然语言处理工具</p>
<p align='center'> A Lightweight Chinese Natural Language Processing Toolkit</p>
<p align='center'>v 0.1.8</p>
<p align='center'>v 0.2.0</p>

[![pypi](https://img.shields.io/badge/pypi-v0.1.8-blue.svg)](https://pypi.org/project/xmnlp/)
[![pypi](https://img.shields.io/badge/pypi-v0.2.0-blue.svg)](https://pypi.org/project/xmnlp/)
![python version](https://img.shields.io/badge/python-2%2C3-orange.svg)
![support os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-yellow.svg)
[![GitHub license](https://img.shields.io/github/license/SeanLee97/xmnlp.svg)](https://github.com/SeanLee97/xmnlp/blob/master/LICENSE)
Expand Down Expand Up @@ -58,6 +58,26 @@ print(xmnlp.seg(doc, hmm=True))
['自然语言', '处理', ':', '是', '人工智能', '和', '语言学', '领域', '的', '分支', '学科', '。', '在', '这此', '领域', '中', '探讨', '如何', '处理', '及', '运用', '自然语言', ';', '自然语言', '认知', '则', '是', '指让', '电脑', '“', '懂', '”', '人类', '的', '语言', '。', '自然语言', '生成', '系统', '把', '计算机', '数据', '转化', '为', '自然语言', '。', '自然语言', '理解', '系统', '把', '自然语言', '转化', '为', '计算机程序', '更', '易于', '处理', '的', '形式', '。']
```

#### 多进程分词
```
xmnlp.seg_parallel(texts[, hmm=False, n_jobs=-1])
- texts: list of str
- hmm: 是否使用 hmm 算法识别新词
- n_jobs: 并行进程数,默认为 -1
```

example

```python
import xmnlp

xmnlp.seg_parallel(['结婚的和尚未结婚的都成了和尚',
'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',
'他正在量和服尺寸'])
```

```
[['结婚', '的', '和', '尚未', '结婚', '的', '都', '成', '了', '和尚'], ['工信处', '女干事', '每月', '经过', '下属', '科室', '都', '要', '亲口', '交代', '24', '口', '交换机', '等', '技术性', '器件', '的', '安装', '工作'], ['他', '正在', '量', '和服', '尺寸']]
```

### 词性标注

```python
Expand All @@ -77,6 +97,26 @@ print(list(xm.tag(doc)))
[('自然语言', 'l'), ('处理', 'v'), (':', 'un'), ('是', 'v'), ('人工智能', 'n'), ('和', 'c'), ('语言学', 'n'), ('领域', 'n'), ('的', 'uj'), ('分支', 'n'), ('学科', 'n'), ('。', 'un'), ('在', 'p'), ('这此', 'un'), ('领域', 'n'), ('中', 'f'), ('探讨', 'v'), ('如何', 'r'), ('处理', 'v'), ('及', 'c'), ('运用', 'vn'), ('自然语言', 'l'), (';', 'un'), ('自然语言', 'l'), ('认知', 'v'), ('则', 'd'), ('是', 'v'), ('指让', 'un'), ('电脑', 'n'), ('“', 'un'), ('懂', 'v'), ('”', 'un'), ('人类', 'n'), ('的', 'uj'), ('语言', 'n'), ('。', 'un'), ('自然语言', 'l'), ('生成', 'v'), ('系统', 'n'), ('把', 'p'), ('计算机', 'n'), ('数据', 'n'), ('转化', 'v'), ('为', 'p'), ('自然语言', 'l'), ('。', 'un'), ('自然语言', 'l'), ('理解', 'v'), ('系统', 'n'), ('把', 'p'), ('自然语言', 'l'), ('转化', 'v'), ('为', 'p'), ('计算机程序', 'n'), ('更', 'd'), ('易于', 'v'), ('处理', 'v'), ('的', 'uj'), ('形式', 'n'), ('。', 'un')]
```

#### 多进程词性标注
```
xmnlp.tag_parallel(texts[, hmm=False, n_jobs=-1])
- texts: list of str
- hmm: 是否使用 hmm 算法识别新词
- n_jobs: 并行进程数,默认为 -1
```

example

```python
import xmnlp

xmnlp.tag_parallel(['结婚的和尚未结婚的都成了和尚',
'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',
'他正在量和服尺寸'])
```

```
[[('结婚', 'v'), ('的', 'uj'), ('和', 'c'), ('尚未', 'd'), ('结婚', 'v'), ('的', 'uj'), ('都', 'd'), ('成', 'n'), ('了', 'ul'), ('和尚', 'nr')], [('工信处', 'n'), ('女干事', 'n'), ('每月', 'r'), ('经过', 'p'), ('下属', 'v'), ('科室', 'n'), ('都', 'd'), ('要', 'v'), ('亲口', 'n'), ('交代', 'n'), ('24', 'm'), ('口', 'q'), ('交换机', 'n'), ('等', 'u'), ('技术性', 'n'), ('器件', 'n'), ('的', 'uj'), ('安装', 'v'), ('工作', 'vn')], [('他', 'r'), ('正在', 't'), ('量', 'n'), ('和服', 'nz'), ('尺寸', 'n')]]
```

### 拼写检查
此功能基于 SymSpell 实现,建议用于检查词级别的错误;对于句子级别的拼写错误,目前尚不能很好地解决。**第一次加载字典的速度较慢(由词典大小决定)**

Expand Down Expand Up @@ -201,6 +241,7 @@ print(xmnlp.radical('自然语言处理'))
* snownlp情感分析语料

本项目受到以下项目的启发

* [jieba](https://github.com/fxsjy/jieba)
* [snownlp](https://github.com/isnowfy/snownlp)

Expand Down
13 changes: 11 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

import os
import sys
import subprocess
Expand All @@ -7,6 +9,13 @@
# email: xmlee97@gmail.com #
#--------------------------------------------#

if sys.version_info[0] == 2:
reload(sys)
sys.setdefaultencoding('utf8')


__version__ = '0.2.0'

LONGDESC = """
============
xmnlp
Expand Down Expand Up @@ -94,11 +103,11 @@ def read(fname):

setup(
name='xmnlp',
version='0.1.8',
version=__version__,
description='A Lightweight Chinese Natural Language Processing Toolkit',
long_description=LONGDESC,
keywords='chinese segmentation,chinese postager,chinese spell check,pinyin,chinese radical',
author='SeanLee97',
author='sean lee',
author_email='xmlee97@gmail.com',
license='MIT License',
platforms=['all'],
Expand Down
62 changes: 62 additions & 0 deletions tests/test_xmnlp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-

import xmnlp
import pytest


@pytest.fixture
def postag_data():
    """Provide the sample sentences shared by the segmentation and tagging tests."""
    sentences = [
        '结婚的和尚未结婚的都成了和尚',
        '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',
        '他正在量和服尺寸',
    ]
    return sentences


def postag_equal(preds, trues):
    """Assert that every prediction equals its expected counterpart.

    Args:
        preds: iterable of predicted results, one entry per input text.
        trues: iterable of expected results, in the same order as ``preds``.

    Raises:
        AssertionError: if the two inputs differ in length or any pair of
            corresponding entries is not equal.
    """
    # Materialize both sides so lazily produced results can be counted.
    preds = list(preds)
    trues = list(trues)
    # zip() stops at the shorter input, so without this check missing or
    # extra predictions would pass unnoticed.
    assert len(preds) == len(trues)
    for y_pred, y_true in zip(preds, trues):
        assert y_pred == y_true


def test_seg(postag_data):
    """Segment each sample sentence on its own and compare with the expected tokens."""
    expected = [
        ['结婚', '的', '和', '尚未', '结婚', '的', '都', '成', '了', '和尚'],
        ['工信处', '女干事', '每月', '经过', '下属', '科室', '都', '要', '亲口', '交代', '24', '口', '交换机', '等', '技术性', '器件', '的', '安装', '工作'],
        ['他', '正在', '量', '和服', '尺寸'],
    ]
    actual = []
    for sentence in postag_data:
        actual.append(xmnlp.seg(sentence))
    postag_equal(actual, expected)


def test_seg_parallel(postag_data):
    """Segment all sample sentences in one parallel call and compare with the expected tokens."""
    expected = [
        ['结婚', '的', '和', '尚未', '结婚', '的', '都', '成', '了', '和尚'],
        ['工信处', '女干事', '每月', '经过', '下属', '科室', '都', '要', '亲口', '交代', '24', '口', '交换机', '等', '技术性', '器件', '的', '安装', '工作'],
        ['他', '正在', '量', '和服', '尺寸'],
    ]
    actual = xmnlp.seg_parallel(postag_data)
    postag_equal(actual, expected)


def test_tag(postag_data):
    """Tag each sample sentence on its own and compare with the expected (word, tag) pairs."""
    res = [
        [('结婚', 'v'), ('的', 'uj'), ('和', 'c'), ('尚未', 'd'), ('结婚', 'v'), ('的', 'uj'), ('都', 'd'), ('成', 'n'), ('了', 'ul'), ('和尚', 'nr')],
        [('工信处', 'n'), ('女干事', 'n'), ('每月', 'r'), ('经过', 'p'), ('下属', 'v'), ('科室', 'n'), ('都', 'd'), ('要', 'v'), ('亲口', 'n'), ('交代', 'n'), ('24', 'm'), ('口', 'q'), ('交换机', 'n'), ('等', 'u'), ('技术性', 'n'), ('器件', 'n'), ('的', 'uj'), ('安装', 'v'), ('工作', 'vn')],
        [('他', 'r'), ('正在', 't'), ('量', 'n'), ('和服', 'nz'), ('尺寸', 'n')],
    ]
    preds = [xmnlp.tag(data) for data in postag_data]
    # Use the shared comparison helper, as the other segmentation and
    # tagging tests do, instead of repeating its loop inline.
    postag_equal(preds, res)


def test_tag_parallel(postag_data):
    """Tag all sample sentences in one parallel call and compare with the expected (word, tag) pairs."""
    expected = [
        [('结婚', 'v'), ('的', 'uj'), ('和', 'c'), ('尚未', 'd'), ('结婚', 'v'), ('的', 'uj'), ('都', 'd'), ('成', 'n'), ('了', 'ul'), ('和尚', 'nr')],
        [('工信处', 'n'), ('女干事', 'n'), ('每月', 'r'), ('经过', 'p'), ('下属', 'v'), ('科室', 'n'), ('都', 'd'), ('要', 'v'), ('亲口', 'n'), ('交代', 'n'), ('24', 'm'), ('口', 'q'), ('交换机', 'n'), ('等', 'u'), ('技术性', 'n'), ('器件', 'n'), ('的', 'uj'), ('安装', 'v'), ('工作', 'vn')],
        [('他', 'r'), ('正在', 't'), ('量', 'n'), ('和服', 'nz'), ('尺寸', 'n')],
    ]
    actual = xmnlp.tag_parallel(postag_data)
    postag_equal(actual, expected)


def test_pinyin():
    """Check the pinyin conversion of a four-character sample word."""
    expected = ['ren', 'gong', 'zhi', 'neng']
    actual = xmnlp.pinyin('人工智能')
    assert expected == actual


def test_radical():
    """Check the radicals returned for each character of a sample phrase."""
    expected = ['自', '灬', '讠', '言', '夂', '王']
    actual = xmnlp.radical('自然语言处理')
    assert expected == actual


def test_sentiment():
    """Check that the sample hotel review scores above 0.5."""
    review_score = xmnlp.sentiment('这酒店真心不错')
    assert review_score > 0.5
Loading

0 comments on commit 25e7c1b

Please sign in to comment.