do a better job of extracting context

2025-03-10 18:53:36 +07:00 · 2025-03-10 18:53:36 +07:00 · f482b3b4c8
parent c386e34c33
commit f482b3b4c8
3 changed files with 308 additions and 15 deletions
--- a/src/services/llm/context_extractor.ts
+++ b/src/services/llm/context_extractor.ts
@ -102,6 +102,107 @@ export class ContextExtractor {
                // Format code notes with code blocks
                formattedContent += '```\n' + content + '\n```';
                break;
+            case 'canvas':
+            case 'mindMap':
+            case 'relationMap':
+            case 'geoMap':
+                if (mime === 'application/json') {
+                    try {
+                        // Parse JSON content
+                        const jsonContent = JSON.parse(content);
+
+                        if (type === 'canvas') {
+                            // Extract text elements from canvas
+                            if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
+                                const texts = jsonContent.elements
+                                    .filter((element: any) => element.type === 'text' && element.text)
+                                    .map((element: any) => element.text);
+
+                                formattedContent += 'Canvas content:\n' + texts.join('\n');
+                                break;
+                            }
+                        }
+                        else if (type === 'mindMap') {
+                            // Extract node text from mind map
+                            const extractMindMapNodes = (node: any): string[] => {
+                                let texts: string[] = [];
+                                if (node.text) {
+                                    texts.push(node.text);
+                                }
+                                if (node.children && Array.isArray(node.children)) {
+                                    for (const child of node.children) {
+                                        texts = texts.concat(extractMindMapNodes(child));
+                                    }
+                                }
+                                return texts;
+                            };
+
+                            if (jsonContent.root) {
+                                formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
+                                break;
+                            }
+                        }
+                        else if (type === 'relationMap') {
+                            // Extract relation map entities and connections
+                            let result = 'Relation map content:\n';
+
+                            if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
+                                result += 'Notes: ' + jsonContent.notes
+                                    .map((note: any) => note.title || note.name)
+                                    .filter(Boolean)
+                                    .join(', ') + '\n';
+                            }
+
+                            if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
+                                result += 'Relations: ' + jsonContent.relations
+                                    .map((rel: any) => {
+                                        const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
+                                        const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
+                                        const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
+                                        const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
+                                        return `${source} → ${rel.name || ''} → ${target}`;
+                                    })
+                                    .join('; ');
+                            }
+
+                            formattedContent += result;
+                            break;
+                        }
+                        else if (type === 'geoMap') {
+                            let result = 'Geographic map content:\n';
+
+                            if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
+                                result += jsonContent.markers
+                                    .map((marker: any) => {
+                                        return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
+                                    })
+                                    .join('\n');
+                            }
+
+                            formattedContent += result || 'Empty geographic map';
+                            break;
+                        }
+                    }
+                    catch (e: any) {
+                        formattedContent += `[Error parsing ${type} content: ${e.message}]`;
+                        break;
+                    }
+                }
+
+                // If JSON parsing or specific handling failed, use default handling
+                formattedContent += `[${type} content]`;
+                break;
+
+            case 'mermaid':
+                // Format mermaid diagrams as code blocks
+                formattedContent += '```mermaid\n' + content + '\n```';
+                break;
+
+            case 'image':
+            case 'file':
+                formattedContent += `[${type} attachment]`;
+                break;
+
            default:
                // For other notes, just use the content as is
                formattedContent += this.sanitizeHtml(content);
@ -114,7 +215,10 @@ export class ContextExtractor {
     * Sanitize HTML content to plain text
     */
    private sanitizeHtml(html: string): string {
-        return sanitizeHtml(html, {
+        if (!html) return '';
+
+        // Use sanitizeHtml to remove all HTML tags
+        let content = sanitizeHtml(html, {
            allowedTags: [],
            allowedAttributes: {},
            textFilter: (text) => {
@ -122,6 +226,17 @@ export class ContextExtractor {
                return text.replace(/\n\s*\n/g, '\n\n');
            }
        });
+
+        // Additional cleanup for any remaining HTML entities
+        content = content
+            .replace(/&nbsp;/g, ' ')
+            .replace(/&lt;/g, '<')
+            .replace(/&gt;/g, '>')
+            .replace(/&amp;/g, '&')
+            .replace(/&quot;/g, '"')
+            .replace(/&#39;/g, "'");
+
+        return content;
    }

    /**
--- a/src/services/llm/embeddings/vector_store.ts
+++ b/src/services/llm/embeddings/vector_store.ts
@ -8,6 +8,7 @@ import type { NoteEmbeddingContext } from "./embeddings_interface.js";
 import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js";
 import eventService from "../../events.js";
 import type BNote from "../../../becca/entities/bnote.js";
+import sanitizeHtml from "sanitize-html";

 // Type definition for embedding result
 interface EmbeddingResult {
@ -183,19 +184,37 @@ export async function findSimilarNotes(
 * Clean note content by removing HTML tags and normalizing whitespace
 */
 function cleanNoteContent(content: string, type: string, mime: string): string {
+    if (!content) return '';
+
    // If it's HTML content, remove HTML tags
    if ((type === 'text' && mime === 'text/html') || content.includes('<div>') || content.includes('<p>')) {
-        // Simple tag removal - for more complex HTML parsing, consider using a proper HTML parser
-        content = content.replace(/<[^>]*>/g, ' '); // Replace tags with a space
+        // Use sanitizeHtml to remove all HTML tags
+        content = sanitizeHtml(content, {
+            allowedTags: [],
+            allowedAttributes: {},
+            textFilter: (text) => {
+                // Normalize the text, removing excessive whitespace
+                return text.replace(/\s+/g, ' ');
+            }
+        });
    }

+    // Additional cleanup for any remaining HTML entities
+    content = content
+        .replace(/&nbsp;/g, ' ')
+        .replace(/&lt;/g, '<')
+        .replace(/&gt;/g, '>')
+        .replace(/&amp;/g, '&')
+        .replace(/&quot;/g, '"')
+        .replace(/&#39;/g, "'");
+
    // Normalize whitespace (replace multiple spaces/newlines with single space)
    content = content.replace(/\s+/g, ' ');

    // Trim the content
    content = content.trim();

-    // Truncate if extremely long (optional, adjust limit as needed)
+    // Truncate if extremely long
    const MAX_CONTENT_LENGTH = 10000;
    if (content.length > MAX_CONTENT_LENGTH) {
        content = content.substring(0, MAX_CONTENT_LENGTH) + ' [content truncated]';
@ -204,6 +223,113 @@ function cleanNoteContent(content: string, type: string, mime: string): string {
    return content;
 }

+/**
+ * Extract content from different note types
+ */
+function extractStructuredContent(content: string, type: string, mime: string): string {
+    try {
+        if (!content) return '';
+
+        // Special handling based on note type
+        switch (type) {
+            case 'mindMap':
+            case 'relationMap':
+            case 'canvas':
+                if (mime === 'application/json') {
+                    const jsonContent = JSON.parse(content);
+
+                    if (type === 'canvas') {
+                        // Extract text elements from canvas
+                        if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
+                            const texts = jsonContent.elements
+                                .filter((element: any) => element.type === 'text' && element.text)
+                                .map((element: any) => element.text);
+                            return texts.join('\n');
+                        }
+                    }
+                    else if (type === 'mindMap') {
+                        // Extract node text from mind map
+                        const extractMindMapNodes = (node: any): string[] => {
+                            let texts: string[] = [];
+                            if (node.text) {
+                                texts.push(node.text);
+                            }
+                            if (node.children && Array.isArray(node.children)) {
+                                for (const child of node.children) {
+                                    texts = texts.concat(extractMindMapNodes(child));
+                                }
+                            }
+                            return texts;
+                        };
+
+                        if (jsonContent.root) {
+                            return extractMindMapNodes(jsonContent.root).join('\n');
+                        }
+                    }
+                    else if (type === 'relationMap') {
+                        // Extract relation map entities and connections
+                        let result = '';
+
+                        if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
+                            result += 'Notes: ' + jsonContent.notes
+                                .map((note: any) => note.title || note.name)
+                                .filter(Boolean)
+                                .join(', ') + '\n';
+                        }
+
+                        if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
+                            result += 'Relations: ' + jsonContent.relations
+                                .map((rel: any) => {
+                                    const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
+                                    const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
+                                    const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
+                                    const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
+                                    return `${source} → ${rel.name || ''} → ${target}`;
+                                })
+                                .join('; ');
+                        }
+
+                        return result;
+                    }
+                }
+                return JSON.stringify(content);
+
+            case 'mermaid':
+                // Return mermaid diagrams as-is (they're human-readable)
+                return content;
+
+            case 'geoMap':
+                if (mime === 'application/json') {
+                    const jsonContent = JSON.parse(content);
+                    let result = '';
+
+                    if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
+                        result += jsonContent.markers
+                            .map((marker: any) => {
+                                return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
+                            })
+                            .join('\n');
+                    }
+
+                    return result || JSON.stringify(content);
+                }
+                return JSON.stringify(content);
+
+            case 'file':
+            case 'image':
+                // For files and images, just return a placeholder
+                return `[${type} attachment]`;
+
+            default:
+                return content;
+        }
+    }
+    catch (error) {
+        console.error(`Error extracting content from ${type} note:`, error);
+        return content;
+    }
+}
+
 /**
 * Gets context for a note to be embedded
 */
@ -282,12 +408,23 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed

    // Get content
    let content = "";
-    if (note.type === 'text') {
-        content = String(await note.getContent());
-    } else if (note.type === 'code') {
-        content = String(await note.getContent());
-    } else if (note.type === 'image' || note.type === 'file') {
-        content = `[${note.type} attachment: ${note.mime}]`;
+
+    try {
+        // Get raw content from the note
+        const rawContent = String(await note.getContent() || "");
+
+        // Process the content based on note type to extract meaningful text
+        if (note.type === 'text' || note.type === 'code') {
+            content = rawContent;
+        } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+            // Process structured content types
+            content = extractStructuredContent(rawContent, note.type, note.mime);
+        } else if (note.type === 'image' || note.type === 'file') {
+            content = `[${note.type} attachment: ${note.mime}]`;
+        }
+    } catch (err) {
+        console.error(`Error getting content for note ${noteId}:`, err);
+        content = `[Error extracting content]`;
    }

    // Clean the content to remove HTML tags and normalize whitespace
--- a/src/services/llm/trilium_context_service.ts
+++ b/src/services/llm/trilium_context_service.ts
@ -5,6 +5,7 @@ import options from "../options.js";
 import log from "../log.js";
 import type { Message } from "./ai_interface.js";
 import { cosineSimilarity } from "./embeddings/vector_store.js";
+import sanitizeHtml from "sanitize-html";

 /**
 * TriliumContextService provides intelligent context management for working with large knowledge bases
@ -351,15 +352,16 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
            context += `--- NOTE ${index + 1}: ${source.title} ---\n`;

            if (source.content) {
+                // Clean up HTML content before adding it to the context
+                let cleanContent = this.sanitizeNoteContent(source.content, source.type, source.mime);
+
                // Truncate content if it's too long
                const maxContentLength = 1000;
-                let content = source.content;
-
-                if (content.length > maxContentLength) {
-                    content = content.substring(0, maxContentLength) + " [content truncated due to length]";
+                if (cleanContent.length > maxContentLength) {
+                    cleanContent = cleanContent.substring(0, maxContentLength) + " [content truncated due to length]";
                }

-                context += `${content}\n`;
+                context += `${cleanContent}\n`;
            } else {
                context += "[This note doesn't contain textual content]\n";
            }
@ -373,6 +375,45 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
        return context;
    }

+    /**
+     * Sanitize note content for use in context, removing HTML tags
+     */
+    private sanitizeNoteContent(content: string, type?: string, mime?: string): string {
+        if (!content) return '';
+
+        // If it's likely HTML content
+        if (
+            (type === 'text' && mime === 'text/html') ||
+            content.includes('<div') ||
+            content.includes('<p>') ||
+            content.includes('<span')
+        ) {
+            // Use sanitizeHtml to remove all HTML tags
+            content = sanitizeHtml(content, {
+                allowedTags: [],
+                allowedAttributes: {},
+                textFilter: (text) => {
+                    // Replace multiple newlines with a single one
+                    return text.replace(/\n\s*\n/g, '\n\n');
+                }
+            });
+
+            // Additional cleanup for remaining HTML entities
+            content = content
+                .replace(/&nbsp;/g, ' ')
+                .replace(/&lt;/g, '<')
+                .replace(/&gt;/g, '>')
+                .replace(/&amp;/g, '&')
+                .replace(/&quot;/g, '"')
+                .replace(/&#39;/g, "'");
+        }
+
+        // Normalize whitespace
+        content = content.replace(/\s+/g, ' ').trim();
+
+        return content;
+    }
+
    /**
     * Process a user query with the Trilium-specific approach:
     * 1. Generate search queries from the original question