bugfix: set PositionIncrementAttribute to fix indexing "ictclas_index" and skip wrong position token
yangyaofei committed Sep 29, 2022
1 parent 1e7b802 commit 3df28bb
Showing 3 changed files with 82 additions and 18 deletions.
@@ -10,6 +10,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.env.Environment;

@@ -30,6 +31,7 @@ public final class IctclasTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute positionAtt = addAttribute(PositionIncrementAttribute.class);

private static boolean initState = false;

@@ -152,37 +154,67 @@ private static synchronized void init(String data, String sLicenceCode, String u
// Tracks how far tokenResults has been consumed
private int cursor = 0;
private int endPosition = 0;
// Begin/end offsets of the last token whose positionAtt was not 0, used to detect backtracking tokens
private int lastBeginPosition = 0;
private int lastEndPosition = 0;
// Tokenization result of the current text; empty means the text is empty or tokenization has not started
private List<TokenResult> tempTokenResults;

private void setTokenResults(List<TokenResult> tokenResults) {
tempTokenResults = tokenResults;
}

private List<TokenResult> getTokenResults() {
return Optional.ofNullable(this.tempTokenResults).orElse(List.of());
}
@Override
public boolean incrementToken() throws IOException {

// If cursor reaches the length of tokenResults, all tokens of the current input have been emitted
if (this.getTokenResults().size() <= cursor) {
return false;
}
@Override
public boolean incrementToken() {
// Get the tokenization result and check whether there are any tokens
this.getTokenResults(input);
// this.getTokenResults(input);
if (this.getTokenResults().size() == 0) {
cursor = 0;
return false;
}
// Tokens are available, start emitting them
clearAttributes();
TokenResult currentToken = this.getTokenResults().get(cursor);
termAtt.append(currentToken.text);
offsetAtt.setOffset(correctOffset(currentToken.begin), correctOffset(currentToken.end));
typeAtt.setType(currentToken.pos);
cursor++;
this.endPosition = currentToken.end;
return true;

// If the current token's end is less than or equal to the largest endPosition seen so far, the token is just a
// finer-grained split of the previous coarse token, so its PositionIncrementAttribute must be set to 0.
// The loop is used to skip tokens whose position information is wrong.
while(true) {
// If cursor reaches the length of tokenResults, all tokens of the current input have been emitted
if (this.getTokenResults().size() <= cursor) {
return false;
}
// Clear the current attributes and start resolving the token data
clearAttributes();
// Get the token and resolve its position information
TokenResult currentToken = this.getTokenResults().get(cursor);
// Skip tokens that begin before the last token whose position increment was non-zero, otherwise indexing throws an exception
if (lastBeginPosition > currentToken.begin){
cursor ++;
continue;
}
// If the current token lies inside the last token whose position increment was non-zero, set its positionAtt to 0:
// it is a fine-grained sub-token of that token. Otherwise it is the next token in sequence: set positionAtt to 1
// and record this token's position as the last positionAtt=1 token.
if (lastEndPosition >= currentToken.end) {
positionAtt.setPositionIncrement(0);
} else {
positionAtt.setPositionIncrement(1);
lastBeginPosition = currentToken.begin;
lastEndPosition = currentToken.end;
}
LOGGER.debug(String.format(
"[%s] start:%s end:%s lastMax:%s position:%s",
currentToken.text, currentToken.begin, currentToken.end, lastEndPosition, positionAtt.getPositionIncrement()
));
// Fill in the remaining attributes
termAtt.append(currentToken.text);
offsetAtt.setOffset(correctOffset(currentToken.begin), correctOffset(currentToken.end));
typeAtt.setType(currentToken.pos);
cursor++;
this.endPosition = currentToken.end;
return true;
}
}

@Override
@@ -198,6 +230,9 @@ public void reset() throws IOException {
// Tokenize and reset the cursor
this.getTokenResults(input);
cursor = 0;
endPosition = 0;
lastBeginPosition = 0;
lastEndPosition = 0;
}

/**
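As an illustrative aside (not part of the commit), the increment rule above can be exercised in isolation with a small standalone sketch. The Span record and the sample offsets are hypothetical; the loop mirrors the incrementToken logic: tokens nested inside the last coarse token get an increment of 0, tokens that begin before it are skipped, and every other token advances the position by 1.

import java.util.List;

public class PositionIncrementSketch {

    // Hypothetical stand-in for TokenResult: a term with begin/end offsets.
    record Span(String text, int begin, int end) {}

    public static void main(String[] args) {
        List<Span> tokens = List.of(
                new Span("ABCD", 0, 4),  // coarse token          -> increment 1
                new Span("AB", 0, 2),    // nested inside "ABCD"  -> increment 0
                new Span("CD", 2, 4),    // nested inside "ABCD"  -> increment 0
                new Span("EF", 4, 6),    // next position         -> increment 1
                new Span("BC", 1, 3)     // begins before "EF"    -> skipped
        );

        int lastBegin = 0;
        int lastEnd = 0;
        for (Span token : tokens) {
            // Skip tokens that backtrack past the last increment-1 token.
            if (lastBegin > token.begin()) {
                System.out.printf("%-4s [%d,%d) skipped%n", token.text(), token.begin(), token.end());
                continue;
            }
            int increment;
            if (lastEnd >= token.end()) {
                // Finer-grained split of the previous coarse token.
                increment = 0;
            } else {
                increment = 1;
                lastBegin = token.begin();
                lastEnd = token.end();
            }
            System.out.printf("%-4s [%d,%d) increment=%d%n", token.text(), token.begin(), token.end(), increment);
        }
    }
}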
@@ -1,5 +1,4 @@
---
"Search":
"List Text":
  - do:
      indices.analyze:
        body:
@@ -0,0 +1,30 @@
---
"Bulk":
  - do:
      indices.create:
        index: test_index
        body:
          mappings:
            properties:
              text:
                type: text
                analyzer: ictclas_index
  - do:
      bulk:
        refresh: true
        body:
          - '{"index": {"_index": "test_index", "_id": "test_id"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
          - '{"index": {"_index": "test_index", "_id": "test_id2"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
          - '{"index": {"_index": "test_index"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
          - '{"index": {"_index": "test_index"}}'
          - '{ "text": "国务院办公厅转发商务部" }'
  - match:
      errors: false
  - do:
      count:
        index: test_index

  - match: { count: 4 }
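For local debugging, a generic TokenStream dumper such as the following can help inspect what the analyzer emits (a sketch, not part of the commit; TokenStreamChecker is a hypothetical helper name, and the stream passed in is assumed to come from the plugin's tokenizer or analyzer). It prints each token's offsets and position increment, so the 0/1 increments and the skipped tokens can be verified directly.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class TokenStreamChecker {

    // Consumes the stream and prints term, offsets, and position increment for every token.
    public static void dump(TokenStream stream) throws IOException {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        try {
            stream.reset();
            while (stream.incrementToken()) {
                System.out.printf("%s [%d,%d) +%d%n",
                        term.toString(), offset.startOffset(), offset.endOffset(),
                        posInc.getPositionIncrement());
            }
            stream.end();
        } finally {
            stream.close();
        }
    }
}

Tokens printed with +0 stack at the same position as the preceding +1 token, which is how the fine-grained splits are meant to be indexed.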
