From 3008741c602a3cd8dab3145e7c65979b07709b52 Mon Sep 17 00:00:00 2001 From: manojks1999 <9743manoj@gmail.com> Date: Wed, 21 Aug 2024 23:36:59 +0530 Subject: [PATCH 1/2] Check for non-breaking spaces --- budou/tinysegmentersegmenter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/budou/tinysegmentersegmenter.py b/budou/tinysegmentersegmenter.py index 862c7a4..725a992 100644 --- a/budou/tinysegmentersegmenter.py +++ b/budou/tinysegmentersegmenter.py @@ -27,6 +27,7 @@ import tinysegmenter from .segmenter import Segmenter from .chunk import Chunk, ChunkList +import re _PARTICLES = {u'か', u'かしら', u'から', u'が', u'くらい', u'けれども', u'こそ', u'さ', u'さえ', u'しか', u'だけ', u'だに', u'だの', u'て', u'で', u'でも', @@ -89,10 +90,14 @@ def segment(self, source, language=None): for word in results: word = word.strip() if not word: - continue - if source[seek: seek + len(word)] != word: + continue + + # Handle non-breaking spaces + preprocessed_word = re.sub(r"[\u00a0\u2000-\u2009\u202F\u205F]", ' ', word) + + if source[seek: seek + len(preprocessed_word)] != preprocessed_word: assert source[seek] == ' ' - assert source[seek + 1: seek + len(word) + 1] == word + assert source[seek + 1: seek + len(preprocessed_word) + 1] == preprocessed_word chunks.append(Chunk.space()) seek += 1 From 355a8ec4023d1fa69a7995e37b223bc0aca8df7e Mon Sep 17 00:00:00 2001 From: manojks1999 <9743manoj@gmail.com> Date: Thu, 22 Aug 2024 00:01:53 +0530 Subject: [PATCH 2/2] added extra spaces --- budou/tinysegmentersegmenter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/budou/tinysegmentersegmenter.py b/budou/tinysegmentersegmenter.py index 725a992..5bff571 100644 --- a/budou/tinysegmentersegmenter.py +++ b/budou/tinysegmentersegmenter.py @@ -93,7 +93,7 @@ def segment(self, source, language=None): continue # Handle non-breaking spaces - preprocessed_word = re.sub(r"[\u00a0\u2000-\u2009\u202F\u205F]", ' ', word) + preprocessed_word = 
re.sub(r"[\u00a0\u2000-\u2009\u202F\u205F\u2012]", ' ', word) if source[seek: seek + len(preprocessed_word)] != preprocessed_word: assert source[seek] == ' '