Agenten-Plattform/ums/utils/schema.py

# Agenten Plattform
#
# (c) 2024 Magnus Bender
# 	Institute of Humanities-Centered Artificial Intelligence (CHAI)
# 	Universitaet Hamburg
# 	https://www.chai.uni-hamburg.de/~bender
#
# source code released under the terms of GNU Public License Version 3
# https://www.gnu.org/licenses/gpl-3.0.txt

"""
	This represents the basic types used for representing extracted information from the data.
	The types are implemented using [pydantic](https://docs.pydantic.dev/).
	It provides validation, allows JSON serialization, and works well with [FastAPI](https://fastapi.tiangolo.com/), which is used internally for the HTTP requests between the agents and the management.

	**This is work in progress!**
"""

from typing import List, Any, Dict

from pydantic import BaseModel

class ExtractionSchema(BaseModel):
	"""
		Common base class for all information extracted from data items.

		Every `ExtractionSchema` must be serializable to JSON.
		Fields should therefore mostly use built-in data types such as `int, str, bool, list, dict, tuple`; nested `ExtractionSchema` and `RiddleInformation` values can be used as well.
	"""

class ExtractedContent(ExtractionSchema):
	"""
		A single extracted content item.
	"""

	type: str
	"""
		Name of the content type as a string; the concrete value depends on the extraction agent.
	"""

	content: str | Any
	"""
		The extracted content itself.
	"""

class ExtractedPositions(ExtractionSchema):
	"""
		A position (e.g., a time or coordinates) at which something was extracted.

		Each position should belong to a content item.
	"""

	type: str
	"""
		Name of the position type as a string; the concrete value depends on the extraction agent.
	"""

	position: str | int | Any
	"""
		The position itself; its form also depends on the extraction agent.
	"""

	description: str | Any = None
	"""
		Optional description giving more details; defaults to `None`.
	"""

class ExtractedData(ExtractionSchema):
	"""
		Container for the items extracted from a data file.
	"""

	# The literal `[]` / `{}` defaults are fine here: pydantic copies field
	# defaults for each instance, so they are not shared between models.

	contents: list[ExtractedContent] = []
	"""
		The extracted contents (e.g., transcriptions); each item should belong to the position item at the same index.
	"""

	positions: list[ExtractedPositions] = []
	"""
		The positions of the extracted contents; each item should belong to the content item at the same index.
	"""

	other: dict[str, Any] = {}
	"""
		Possibly more data. Use a keyword (depending on the agent) and store the data under it.
	"""