Step 1: Setup¶
Project structure and template selection.
Goal¶
Set up a robust project structure for maintaining a domain-specific knowledge abstract.
Project Structure¶
Create the following directory structure:
Structure:
my_ka/
├── config.yaml # Project configuration
├── documents/ # Source documents
│ ├── raw/ # Original files
│ └── processed/ # Processing log
├── ka/ # Knowledge abstract versions
│ └── current/ # Current version (symlink)
├── backups/ # Version backups
├── logs/ # Operation logs
└── kb_manager.py # Management script
Configuration¶
config.yaml¶
# Knowledge Abstract Configuration
name: "Company Knowledge Abstract"
domain: "legal" # or finance, medical, etc.
# Template settings
template: "legal/contract_obligation" # Choose based on domain
language: "en"
# Processing settings
chunk_size: 2048
max_workers: 10
# Versioning
version_format: "v{major}.{minor}"
auto_backup: true
# Paths
documents_dir: "./documents/raw"
ka_dir: "./ka"
backup_dir: "./backups"
Template Selection Guide¶
| Domain | Recommended Template | Auto-Type |
|---|---|---|
| Legal | legal/contract_obligation | list |
| Finance | finance/earnings_summary | model |
| Medical | medicine/anatomy_graph | graph |
| General | general/graph | graph |
| Research | general/concept_graph | graph |
Management Script¶
kb_manager.py (Starter)¶
"""Knowledge Abstract Manager."""
import yaml
import json
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import Optional
from hyperextract import Template
@dataclass
class KBConfig:
    """Settings for one knowledge-abstract project, as read from config.yaml.

    The first seven fields are required. The remaining fields are optional
    so that both the minimal config and the full sample config (which also
    carries processing and versioning keys) can be passed as ``KBConfig(**data)``.
    """

    # Required settings
    name: str
    domain: str
    template: str
    language: str
    documents_dir: str
    ka_dir: str
    backup_dir: str

    # Optional settings; defaults mirror the values in the sample config.yaml.
    chunk_size: int = 2048
    max_workers: int = 10
    version_format: str = "v{major}.{minor}"
    auto_backup: bool = True
class KnowledgeBaseManager:
    """Manage the on-disk layout and saved versions of a knowledge abstract.

    Settings are read from a YAML file. Versions are written beneath
    ``config.ka_dir`` and a ``current`` symlink in that directory points at
    the most recently saved version.
    """

    def __init__(self, config_path: str = "config.yaml"):
        """Load settings from *config_path*; nothing is created on disk yet."""
        self.config = self._load_config(config_path)
        self.ka = None  # template instance, assigned by initialize()
        self.current_kb = None

    def _load_config(self, path: str) -> "KBConfig":
        """Load configuration from YAML.

        Keys that ``KBConfig`` does not declare are dropped, so a config file
        may carry extra settings without breaking construction.

        Raises:
            TypeError: if a required ``KBConfig`` field is missing.
        """
        # Local import: the file-level import only brings in `dataclass`.
        from dataclasses import fields

        with open(path, encoding="utf-8") as config_file:
            # safe_load returns None for an empty file; treat that as "no keys".
            data = yaml.safe_load(config_file) or {}

        declared = {field.name for field in fields(KBConfig)}
        return KBConfig(**{key: value for key, value in data.items() if key in declared})

    def initialize(self):
        """Initialize knowledge abstract.

        Creates the documents, knowledge-abstract and backup directories
        (existing ones are left alone) and builds the configured template.
        """
        print(f"Initializing: {self.config.name}")
        # Create directories
        Path(self.config.documents_dir).mkdir(parents=True, exist_ok=True)
        Path(self.config.ka_dir).mkdir(parents=True, exist_ok=True)
        Path(self.config.backup_dir).mkdir(parents=True, exist_ok=True)
        # Create template
        self.ka = Template.create(
            self.config.template,
            language=self.config.language,
        )
        print("✓ Initialization complete")

    def get_version_path(self, version: Optional[str] = None) -> Path:
        """Get path for a specific version.

        When *version* is None, a timestamp name of the form
        ``vYYYYMMDD_HHMMSS`` is generated from the current local time.
        """
        if version is None:
            version = datetime.now().strftime("v%Y%m%d_%H%M%S")
        return Path(self.config.ka_dir) / version

    def save_version(self, kb_instance, version: Optional[str] = None):
        """Save knowledge abstract version.

        Calls ``kb_instance.dump`` with the version path, then repoints the
        ``current`` symlink at that version.

        Returns:
            The path the version was dumped to.
        """
        path = self.get_version_path(version)
        # Make sure the versions directory exists even if initialize() was skipped.
        path.parent.mkdir(parents=True, exist_ok=True)
        kb_instance.dump(str(path))

        # Update current symlink
        current_link = path.parent / "current"
        # exists() follows symlinks and is False for a dangling link, so check
        # is_symlink() too; otherwise symlink_to() fails with FileExistsError.
        if current_link.is_symlink() or current_link.exists():
            current_link.unlink()
        # A relative symlink target is resolved from the link's own directory,
        # so point at the sibling version name rather than the cwd-relative path.
        current_link.symlink_to(path.name, target_is_directory=True)

        print(f"✓ Saved version: {path.name}")
        return path
# Usage: run this file directly to perform first-time setup.
if __name__ == "__main__":
    # The constructor defaults to "config.yaml" in the working directory.
    KnowledgeBaseManager().initialize()
Initialization¶
1. Create Config¶
cat > config.yaml << 'EOF'
name: "My Knowledge Abstract"
domain: "general"
template: "general/graph"
language: "en"
documents_dir: "./documents/raw"
ka_dir: "./ka"
backup_dir: "./backups"
EOF
2. Run Setup¶
python kb_manager.py
Expected output:
Initializing: My Knowledge Abstract
✓ Initialization complete
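3. Verify Structure¶
Confirm that the documents/raw/, ka/, and backups/ directories now exist:
ls -d documents/raw ka backups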
Template Testing¶
Before ingesting documents, test your template:
# test_template.py
# Smoke test: build the configured template and parse one sample sentence.
from hyperextract import Template
import yaml

# Read the project settings
with open("config.yaml") as config_file:
    settings = yaml.safe_load(config_file)

template_name = settings["template"]

# Build the template named in the config
ka = Template.create(template_name, settings["language"])

# Parse a short sample and report what came back
sample = "This is a test document for template validation."
parsed = ka.parse(sample)

print(f"Template: {template_name}")
print(f"Nodes: {len(parsed.nodes)}")
print(f"Edges: {len(parsed.edges)}")