balibabu
committed on
Commit
·
2b252d0
1
Parent(s):
240ac86
feat: Added explanation on the parsing method of knowledge graph #1594 (#1916)
Browse files
### What problem does this PR solve?
feat: Added explanation on the parsing method of knowledge graph #1594
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- web/src/assets/svg/chunk-method/knowledge-graph-01.svg +0 -0
- web/src/assets/svg/chunk-method/knowledge-graph-02.svg +0 -0
- web/src/components/chunk-method-modal/hooks.ts +2 -2
- web/src/locales/en.ts +8 -1
- web/src/locales/zh-traditional.ts +8 -1
- web/src/locales/zh.ts +8 -1
- web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx +2 -1
- web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts +6 -1
- web/src/pages/add-knowledge/components/knowledge-setting/utils.ts +1 -0
web/src/assets/svg/chunk-method/knowledge-graph-01.svg
ADDED
|
|
web/src/assets/svg/chunk-method/knowledge-graph-02.svg
ADDED
|
|
web/src/components/chunk-method-modal/hooks.ts
CHANGED
|
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
|
|
| 27 |
'one',
|
| 28 |
'qa',
|
| 29 |
'manual',
|
| 30 |
-
'knowledge_graph'
|
| 31 |
],
|
| 32 |
],
|
| 33 |
[
|
|
@@ -67,7 +67,7 @@ const ParserListMap = new Map([
|
|
| 67 |
],
|
| 68 |
[['md'], ['naive', 'qa', 'knowledge_graph']],
|
| 69 |
[['json'], ['naive', 'knowledge_graph']],
|
| 70 |
-
[['eml'], ['email']]
|
| 71 |
]);
|
| 72 |
|
| 73 |
const getParserList = (
|
|
|
|
| 27 |
'one',
|
| 28 |
'qa',
|
| 29 |
'manual',
|
| 30 |
+
'knowledge_graph',
|
| 31 |
],
|
| 32 |
],
|
| 33 |
[
|
|
|
|
| 67 |
],
|
| 68 |
[['md'], ['naive', 'qa', 'knowledge_graph']],
|
| 69 |
[['json'], ['naive', 'knowledge_graph']],
|
| 70 |
+
[['eml'], ['email']],
|
| 71 |
]);
|
| 72 |
|
| 73 |
const getParserList = (
|
web/src/locales/en.ts
CHANGED
|
@@ -199,7 +199,7 @@ export default {
|
|
| 199 |
We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents.
|
| 200 |
So, the figures and tables in the same section will not be sliced apart, and chunk size might be large.
|
| 201 |
</p>`,
|
| 202 |
-
naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT</b>.</p>
|
| 203 |
<p>This method apply the naive ways to chunk files: </p>
|
| 204 |
<p>
|
| 205 |
<li>Successive text will be sliced into pieces using vision detection model.</li>
|
|
@@ -271,6 +271,13 @@ export default {
|
|
| 271 |
</p><p>
|
| 272 |
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
| 273 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
useRaptor: 'Use RAPTOR to enhance retrieval',
|
| 275 |
useRaptorTip:
|
| 276 |
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
|
|
|
| 199 |
We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents.
|
| 200 |
So, the figures and tables in the same section will not be sliced apart, and chunk size might be large.
|
| 201 |
</p>`,
|
| 202 |
+
naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>.</p>
|
| 203 |
<p>This method apply the naive ways to chunk files: </p>
|
| 204 |
<p>
|
| 205 |
<li>Successive text will be sliced into pieces using vision detection model.</li>
|
|
|
|
| 271 |
</p><p>
|
| 272 |
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
| 273 |
</p>`,
|
| 274 |
+
knowledgeGraph: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>
|
| 275 |
+
|
| 276 |
+
<p>After files being chunked, it uses chunks to extract knowledge graph and mind map of the entire document. This method apply the naive ways to chunk files:
|
| 277 |
+
Successive text will be sliced into pieces each of which is around 512 token number.</p>
|
| 278 |
+
<p>Next, chunks will be transmited to LLM to extract nodes and relationships of a knowledge graph, and a mind map.</p>
|
| 279 |
+
|
| 280 |
+
Mind the entiry type you need to specify.</p>`,
|
| 281 |
useRaptor: 'Use RAPTOR to enhance retrieval',
|
| 282 |
useRaptorTip:
|
| 283 |
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
web/src/locales/zh-traditional.ts
CHANGED
|
@@ -190,7 +190,7 @@ export default {
|
|
| 190 |
我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。
|
| 191 |
因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。
|
| 192 |
</p>`,
|
| 193 |
-
naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
|
| 194 |
<p>此方法將簡單的方法應用於塊文件:</p>
|
| 195 |
<p>
|
| 196 |
<li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li>
|
|
@@ -244,6 +244,13 @@ export default {
|
|
| 244 |
</p><p>
|
| 245 |
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
| 246 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
useRaptor: '使用RAPTOR文件增強策略',
|
| 248 |
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
| 249 |
prompt: '提示詞',
|
|
|
|
| 190 |
我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。
|
| 191 |
因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。
|
| 192 |
</p>`,
|
| 193 |
+
naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
|
| 194 |
<p>此方法將簡單的方法應用於塊文件:</p>
|
| 195 |
<p>
|
| 196 |
<li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li>
|
|
|
|
| 244 |
</p><p>
|
| 245 |
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
| 246 |
</p>`,
|
| 247 |
+
knowledgeGraph: `<p>支援的檔案格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>
|
| 248 |
+
|
| 249 |
+
<p>文件分塊後,使用分塊擷取整個文件的知識圖譜和心智圖。此方法將簡單的方法應用於區塊檔案:
|
| 250 |
+
連續的文字將被分割成多個片段,每個片段大約有 512 個令牌數。
|
| 251 |
+
<p>接下來,區塊將傳送到LLM以提取知識圖譜和思維導圖的節點和關係。
|
| 252 |
+
|
| 253 |
+
<p>請注意您需要指定的條目類型。</p></p>`,
|
| 254 |
useRaptor: '使用RAPTOR文件增強策略',
|
| 255 |
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
| 256 |
prompt: '提示詞',
|
web/src/locales/zh.ts
CHANGED
|
@@ -191,7 +191,7 @@ export default {
|
|
| 191 |
我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。
|
| 192 |
因此,同一部分中的图和表不会被分割,并且块大小可能会很大。
|
| 193 |
</p>`,
|
| 194 |
-
naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
|
| 195 |
<p>此方法将简单的方法应用于块文件:</p>
|
| 196 |
<p>
|
| 197 |
<li>系统将使用视觉检测模型将连续文本分割成多个片段。</li>
|
|
@@ -261,6 +261,13 @@ export default {
|
|
| 261 |
</p><p>
|
| 262 |
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
| 263 |
</p>`,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
useRaptor: '使用召回增强RAPTOR策略',
|
| 265 |
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
| 266 |
prompt: '提示词',
|
|
|
|
| 191 |
我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。
|
| 192 |
因此,同一部分中的图和表不会被分割,并且块大小可能会很大。
|
| 193 |
</p>`,
|
| 194 |
+
naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
|
| 195 |
<p>此方法将简单的方法应用于块文件:</p>
|
| 196 |
<p>
|
| 197 |
<li>系统将使用视觉检测模型将连续文本分割成多个片段。</li>
|
|
|
|
| 261 |
</p><p>
|
| 262 |
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
| 263 |
</p>`,
|
| 264 |
+
knowledgeGraph: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>
|
| 265 |
+
|
| 266 |
+
<p>文件分块后,使用分块提取整个文档的知识图谱和思维导图。此方法将简单的方法应用于分块文件:
|
| 267 |
+
连续的文本将被切成大约 512 个 token 数的块。</p>
|
| 268 |
+
<p>接下来,将分块传输到 LLM 以提取知识图谱和思维导图的节点和关系。</p>
|
| 269 |
+
|
| 270 |
+
注意您需要指定的条目类型。</p>`,
|
| 271 |
useRaptor: '使用召回增强RAPTOR策略',
|
| 272 |
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
| 273 |
prompt: '提示词',
|
web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx
CHANGED
|
@@ -3,6 +3,7 @@ import { useTranslate } from '@/hooks/common-hooks';
|
|
| 3 |
import { useSelectParserList } from '@/hooks/user-setting-hooks';
|
| 4 |
import { Col, Divider, Empty, Row, Typography } from 'antd';
|
| 5 |
import DOMPurify from 'dompurify';
|
|
|
|
| 6 |
import { useMemo } from 'react';
|
| 7 |
import styles from './index.less';
|
| 8 |
import { ImageMap } from './utils';
|
|
@@ -18,7 +19,7 @@ const CategoryPanel = ({ chunkMethod }: { chunkMethod: string }) => {
|
|
| 18 |
if (item) {
|
| 19 |
return {
|
| 20 |
title: item.label,
|
| 21 |
-
description: t(item.value),
|
| 22 |
};
|
| 23 |
}
|
| 24 |
return { title: '', description: '' };
|
|
|
|
| 3 |
import { useSelectParserList } from '@/hooks/user-setting-hooks';
|
| 4 |
import { Col, Divider, Empty, Row, Typography } from 'antd';
|
| 5 |
import DOMPurify from 'dompurify';
|
| 6 |
+
import camelCase from 'lodash/camelCase';
|
| 7 |
import { useMemo } from 'react';
|
| 8 |
import styles from './index.less';
|
| 9 |
import { ImageMap } from './utils';
|
|
|
|
| 19 |
if (item) {
|
| 20 |
return {
|
| 21 |
title: item.label,
|
| 22 |
+
description: t(camelCase(item.value)),
|
| 23 |
};
|
| 24 |
}
|
| 25 |
return { title: '', description: '' };
|
web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
CHANGED
|
@@ -37,6 +37,9 @@ export const useSubmitKnowledgeConfiguration = (form: FormInstance) => {
|
|
| 37 |
};
|
| 38 |
};
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
| 41 |
const parserList = useSelectParserList();
|
| 42 |
const allOptions = useSelectLlmOptionsByModelType();
|
|
@@ -62,7 +65,9 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
|
| 62 |
}, [form, knowledgeDetails]);
|
| 63 |
|
| 64 |
return {
|
| 65 |
-
parserList
|
|
|
|
|
|
|
| 66 |
embeddingModelOptions: allOptions[LlmModelType.Embedding],
|
| 67 |
disabled: knowledgeDetails.chunk_num > 0,
|
| 68 |
};
|
|
|
|
| 37 |
};
|
| 38 |
};
|
| 39 |
|
| 40 |
+
// The value that does not need to be displayed in the analysis method Select
|
| 41 |
+
const HiddenFields = ['email', 'picture', 'audio'];
|
| 42 |
+
|
| 43 |
export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
| 44 |
const parserList = useSelectParserList();
|
| 45 |
const allOptions = useSelectLlmOptionsByModelType();
|
|
|
|
| 65 |
}, [form, knowledgeDetails]);
|
| 66 |
|
| 67 |
return {
|
| 68 |
+
parserList: parserList.filter(
|
| 69 |
+
(x) => !HiddenFields.some((y) => y === x.value),
|
| 70 |
+
),
|
| 71 |
embeddingModelOptions: allOptions[LlmModelType.Embedding],
|
| 72 |
disabled: knowledgeDetails.chunk_num > 0,
|
| 73 |
};
|
web/src/pages/add-knowledge/components/knowledge-setting/utils.ts
CHANGED
|
@@ -15,6 +15,7 @@ export const ImageMap = {
|
|
| 15 |
resume: getImageName('resume', 2),
|
| 16 |
table: getImageName('table', 2),
|
| 17 |
one: getImageName('one', 2),
|
|
|
|
| 18 |
};
|
| 19 |
|
| 20 |
export const TextMap = {
|
|
|
|
| 15 |
resume: getImageName('resume', 2),
|
| 16 |
table: getImageName('table', 2),
|
| 17 |
one: getImageName('one', 2),
|
| 18 |
+
knowledge_graph: getImageName('knowledge-graph', 2),
|
| 19 |
};
|
| 20 |
|
| 21 |
export const TextMap = {
|