bugfix: set PositionIncrementAttribute to fix indexing "ictclas_index" and skip wrong position token
yangyaofei committed Sep 29, 2022
1 parent 1e7b802 commit 3df28bb
Showing 3 changed files with 82 additions and 18 deletions.
@@ -10,6 +10,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.env.Environment;

@@ -30,6 +31,7 @@ public final class IctclasTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute positionAtt = addAttribute(PositionIncrementAttribute.class);

private static boolean initState = false;

@@ -152,37 +154,67 @@ private static synchronized void init(String data, String sLicenceCode, String u
// Tracks how far tokenResults has been consumed
private int cursor = 0;
private int endPosition = 0;
// Begin/end offsets of the last token whose positionAtt was not 0, used to detect backtracking tokens
private int lastBeginPosition = 0;
private int lastEndPosition = 0;
// Tokenization result of the current text; empty means the text is empty or tokenization has not started
private List<TokenResult> tempTokenResults;

private void setTokenResults(List<TokenResult> tokenResults) {
tempTokenResults = tokenResults;
}

private List<TokenResult> getTokenResults() {
return Optional.ofNullable(this.tempTokenResults).orElse(List.of());
}
@Override
public boolean incrementToken() throws IOException {

// If cursor reaches the length of tokenResults, all tokens of the current input have been emitted
if (this.getTokenResults().size() <= cursor) {
return false;
}
@Override
public boolean incrementToken() {
// Get the tokenization result and check whether there are any tokens
this.getTokenResults(input);
// this.getTokenResults(input);
if (this.getTokenResults().size() == 0) {
cursor = 0;
return false;
}
// Tokens are available, start emitting them
clearAttributes();
TokenResult currentToken = this.getTokenResults().get(cursor);
termAtt.append(currentToken.text);
offsetAtt.setOffset(correctOffset(currentToken.begin), correctOffset(currentToken.end));
typeAtt.setType(currentToken.pos);
cursor++;
this.endPosition = currentToken.end;
return true;

// If the current token's end is less than or equal to the largest endPosition seen so far, the token is just a
// finer-grained split of the previous coarse token, so its PositionIncrementAttribute must be set to 0.
// The loop is used to skip tokens whose position information is wrong.
while(true) {
// If cursor reaches the length of tokenResults, all tokens of the current input have been emitted
if (this.getTokenResults().size() <= cursor) {
return false;
}
// Clear the current attributes and start resolving the token data
clearAttributes();
// Get the token and resolve its position information
TokenResult currentToken = this.getTokenResults().get(cursor);
// Skip tokens that begin before the last token whose position increment was non-zero, otherwise indexing throws an exception
if (lastBeginPosition > currentToken.begin){
cursor ++;
continue;
}
// If the current token lies inside the last token whose position increment was non-zero, set its positionAtt to 0:
// it is a fine-grained sub-token of that token. Otherwise it is the next token in sequence: set positionAtt to 1
// and record this token's position as the last positionAtt=1 token.
if (lastEndPosition >= currentToken.end) {
positionAtt.setPositionIncrement(0);
} else {
positionAtt.setPositionIncrement(1);
lastBeginPosition = currentToken.begin;
lastEndPosition = currentToken.end;
}
LOGGER.debug(String.format(
"[%s] start:%s end:%s lastMax:%s position:%s",
currentToken.text, currentToken.begin, currentToken.end, lastEndPosition, positionAtt.getPositionIncrement()
));
// Fill in the remaining attributes
termAtt.append(currentToken.text);
offsetAtt.setOffset(correctOffset(currentToken.begin), correctOffset(currentToken.end));
typeAtt.setType(currentToken.pos);
cursor++;
this.endPosition = currentToken.end;
return true;
}
}

@Override
@@ -198,6 +230,9 @@ public void reset() throws IOException {
// Tokenize and reset the cursor
this.getTokenResults(input);
cursor = 0;
endPosition = 0;
lastBeginPosition = 0;
lastEndPosition = 0;
}

/**
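As an illustrative aside (not part of the commit), the increment rule above can be exercised in isolation with a small standalone sketch. The Span record and the sample offsets are hypothetical; the loop mirrors the incrementToken logic: tokens nested inside the last coarse token get an increment of 0, tokens that begin before it are skipped, and every other token advances the position by 1.

import java.util.List;

public class PositionIncrementSketch {

    // Hypothetical stand-in for TokenResult: a term with begin/end offsets.
    record Span(String text, int begin, int end) {}

    public static void main(String[] args) {
        List<Span> tokens = List.of(
                new Span("ABCD", 0, 4),  // coarse token          -> increment 1
                new Span("AB", 0, 2),    // nested inside "ABCD"  -> increment 0
                new Span("CD", 2, 4),    // nested inside "ABCD"  -> increment 0
                new Span("EF", 4, 6),    // next position         -> increment 1
                new Span("BC", 1, 3)     // begins before "EF"    -> skipped
        );

        int lastBegin = 0;
        int lastEnd = 0;
        for (Span token : tokens) {
            // Skip tokens that backtrack past the last increment-1 token.
            if (lastBegin > token.begin()) {
                System.out.printf("%-4s [%d,%d) skipped%n", token.text(), token.begin(), token.end());
                continue;
            }
            int increment;
            if (lastEnd >= token.end()) {
                // Finer-grained split of the previous coarse token.
                increment = 0;
            } else {
                increment = 1;
                lastBegin = token.begin();
                lastEnd = token.end();
            }
            System.out.printf("%-4s [%d,%d) increment=%d%n", token.text(), token.begin(), token.end(), increment);
        }
    }
}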
@@ -1,5 +1,4 @@
---
"Search":
"List Text":
  - do:
      indices.analyze:
        body:
@@ -0,0 +1,30 @@
---
"Bulk":
  - do:
      indices.create:
        index: test_index
        body:
          mappings:
            properties:
              text:
                type: text
                analyzer: ictclas_index
  - do:
      bulk:
        refresh: true
        body:
          - '{"index": {"_index": "test_index", "_id": "test_id"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
          - '{"index": {"_index": "test_index", "_id": "test_id2"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
          - '{"index": {"_index": "test_index"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
          - '{"index": {"_index": "test_index"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
  - match:
      errors: false
  - do:
      count:
        index: test_index

  - match: { count: 4 }
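For local debugging, a generic TokenStream dumper such as the following can help inspect what the analyzer emits (a sketch, not part of the commit; TokenStreamChecker is a hypothetical helper name, and the stream passed in is assumed to come from the plugin's tokenizer or analyzer). It prints each token's offsets and position increment, so the 0/1 increments and the skipped tokens can be verified directly.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class TokenStreamChecker {

    // Consumes the stream and prints term, offsets, and position increment for every token.
    public static void dump(TokenStream stream) throws IOException {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        try {
            stream.reset();
            while (stream.incrementToken()) {
                System.out.printf("%s [%d,%d) +%d%n",
                        term.toString(), offset.startOffset(), offset.endOffset(),
                        posInc.getPositionIncrement());
            }
            stream.end();
        } finally {
            stream.close();
        }
    }
}

Tokens printed with +0 stack at the same position as the preceding +1 token, which is how the fine-grained splits are meant to be indexed.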
