Connection

geneva.db.Connection

Bases: DBConnection

Geneva Connection.

Source code in geneva/db.py

class Connection(DBConnection):
    """Geneva Connection."""

    def __init__(
        self,
        uri: str,
        *,
        region: str = "us-east-1",
        api_key: Credential | None = None,
        host_override: str | None = None,
        storage_options: dict[str, str] | None = None,
        checkpoint_store: CheckpointStore | None = None,
        packager: UDFPackager | None = None,
        namespace_impl: str | None = None,
        namespace_properties: dict[str, str] | None = None,
        system_namespace: list[str],
        **kwargs,
    ) -> None:
        """Record the connection settings on the instance.

        The lancedb handle, namespace connection, Flight SQL client and the
        job/cluster/manifest managers are all initialised to ``None`` here and
        are created by the properties/methods that need them.

        Parameters
        ----------
        uri: str
            Database URI.
        region: str, default "us-east-1"
            Passed to ``lancedb.connect`` when the plain connection is opened.
        api_key: Credential, optional
            Passed to ``lancedb.connect``.
        host_override: str, optional
            Passed to ``lancedb.connect``; its hostname is also used for the
            Flight SQL client.
        storage_options: dict[str, str], optional
            Storage options handed to the underlying connection and tables.
        checkpoint_store: CheckpointStore, optional
            Stored as-is on the instance.
        packager: UDFPackager, optional
            A falsy value is replaced by a new ``DockerUDFPackager()``.
        namespace_impl: str, optional
            Namespace implementation name; when set, connections go through
            the namespace.
        namespace_properties: dict[str, str], optional
            Properties used to connect to the namespace.
        system_namespace: list[str]
            Namespace handed to the job/cluster/manifest managers. It is only
            kept when ``namespace_impl`` is truthy; otherwise ``[]`` is used.
        **kwargs
            Extra keyword arguments forwarded when the underlying connection
            is created.
        """
        super().__init__()

        # Target and credentials.
        self._uri = uri
        self._region = region
        self._api_key = api_key
        self._host_override = host_override
        self._storage_options = storage_options

        # Underlying lancedb connection, created on first use.
        self._ldb: DBConnection | None = None

        self._checkpoint_store = checkpoint_store
        self._packager = packager if packager else DockerUDFPackager()

        # Namespace configuration.
        self.namespace_impl = namespace_impl
        self.namespace_properties = namespace_properties
        if namespace_impl:
            self.system_namespace = system_namespace
        else:
            # Without a namespace implementation there is no system namespace.
            self.system_namespace = []

        # Handles that are built lazily by the accessors below.
        self._jobs_manager: JobStateManager | None = None
        self._cluster_manager: ClusterConfigManager | None = None
        self._manifest_manager: ManifestConfigManager | None = None
        self._flight_client: FlightSQLClient | None = None
        self._namespace_connection: LanceNamespaceDBConnection | None = None

        self._kwargs = kwargs

    def __repr__(self) -> str:
        """Return a short debug representation containing the connection URI."""
        uri = self.uri
        return f"<Geneva uri={uri}>"

    def __getstate__(self) -> dict:
        """Return the settings needed to rebuild this connection after pickling.

        Every key is a parameter name of ``__init__`` because ``__setstate__``
        re-runs ``__init__`` with this dict. Lazily created handles
        (connections, clients, managers) are not included.

        ``namespace_impl`` and ``namespace_properties`` are part of the state:
        ``__init__`` discards ``system_namespace`` when ``namespace_impl`` is
        unset, and ``_connect`` picks the namespace connection based on it, so
        leaving them out would silently turn a namespace-backed connection
        into a plain one on the unpickling side.

        Returns
        -------
        dict
            Mapping of ``__init__`` parameter names to their current values.
        """
        return {
            "uri": self._uri,
            "api_key": self._api_key,
            "host_override": self._host_override,
            "storage_options": self._storage_options,
            "region": self._region,
            "namespace_impl": self.namespace_impl,
            "namespace_properties": self.namespace_properties,
            "system_namespace": self.system_namespace,
        }

    def __setstate__(self, state) -> None:
        """Rebuild the instance by re-running ``__init__`` with pickled state.

        ``uri`` is removed from ``state`` and passed positionally; every
        remaining entry is forwarded as a keyword argument.
        """
        uri = state.pop("uri")
        self.__init__(uri, **state)

    def __enter__(self) -> "Connection":
        """Enter a ``with`` block; the connection itself is the managed object."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the connection when the ``with`` block ends.

        The method returns ``None``, so an exception raised inside the block
        is not suppressed.
        """
        self.close()

    def close(self) -> None:
        """Close the Flight SQL client and the lancedb connection, if present."""
        flight = self._flight_client
        if flight is not None:
            flight.close()

        ldb = self._ldb
        if ldb is None or not hasattr(ldb, "_conn"):
            return
        # Reach into the wrapped client so the connection is closed eagerly.
        ldb._conn.close()  # type: ignore[attr-defined]

    @cached_property
    def namespace_client(self) -> LanceNamespace | None:
        """Return a namespace client, or None when no namespace is configured.

        A client is created only when both ``namespace_impl`` and
        ``namespace_properties`` are set.
        """
        impl, props = self.namespace_impl, self.namespace_properties
        if impl is None or props is None:
            return None
        return namespace_connect(impl, props)

    @cached_property
    def _connect(self) -> DBConnection:
        """Return the underlying connection, creating it on first access.

        When ``namespace_impl`` is set, the result is a
        ``LanceNamespaceDBConnection``; otherwise it is whatever
        ``lancedb.connect`` returns for this connection's settings.
        """
        if self.namespace_impl is None:
            # Plain lancedb path: connect once and keep the handle.
            if self._ldb is None:
                self._ldb = lancedb.connect(
                    self.uri,
                    region=self._region,
                    api_key=self._api_key,
                    host_override=self._host_override,
                    storage_options=self._storage_options,
                    **self._kwargs,
                )
            return self._ldb

        # Namespace path: build the namespace-backed connection if needed.
        if not self._namespace_connection:
            assert self.namespace_properties is not None, (
                "namespace_properties must be set when namespace_impl is set"
            )
            self._namespace_connection = LanceNamespaceDBConnection(
                namespace_connect(self.namespace_impl, self.namespace_properties),
                storage_options=self._storage_options,
                **self._kwargs,
            )
        _LOG.info(f"using namespace connection: {self._namespace_connection=}")
        return self._namespace_connection

    @cached_property
    def _history(self) -> "JobStateManager":  # noqa: F821
        """Return the JobStateManager for this connection, creating it if needed.

        The manager is constructed with this connection and its
        ``system_namespace``.
        """
        from geneva.jobs import JobStateManager

        manager = self._jobs_manager
        if manager is None:
            manager = JobStateManager(self, namespace=self.system_namespace)
            self._jobs_manager = manager
        return manager

    @cached_property
    def flight_client(self) -> "FlightSQLClient":
        """Return the Flight SQL client, creating it on first access.

        The client targets the hostname parsed from ``host_override`` on port
        10025 and sends the connection URI as the ``database`` metadata entry.
        """
        from flightsql import FlightSQLClient

        if self._flight_client is None:
            host = urlparse(self._host_override).hostname
            self._flight_client = FlightSQLClient(
                host=host,
                port=10025,
                # Placeholder token: real authentication is not wired in yet.
                token="DATABASE_TOKEN",
                # The URI doubles as the project id.
                metadata={"database": self.uri},
                features={"metadata-reflection": "true"},
                # TLS is disabled here; could be switched to False.
                insecure=True,
            )
        return self._flight_client

    @override
    def table_names(
        self, page_token: str | None = None, limit: int | None = None, *args, **kwargs
    ) -> Iterable[str]:
        """List table and view names from the underlying connection.

        Parameters
        ----------
        page_token: str, optional
            Pagination token forwarded unchanged.
        limit: int, optional
            Page size; a falsy value (``None`` or ``0``) is replaced by 10.
        """
        page_size = limit or 10
        backend = self._connect
        return backend.table_names(
            *args, page_token=page_token, limit=page_size, **kwargs
        )

    @override
    def open_table(
        self,
        name: str,
        storage_options: dict[str, str] | None = None,
        index_cache_size: int | None = None,
        version: int | None = None,
        namespace: list[str] | None = None,
        *args,
        **kwargs,
    ) -> "Table":
        """Return a Geneva ``Table`` bound to this connection.

        Parameters
        ----------
        name: str
            Name of the table.
        storage_options: dict[str, str], optional
            Options for the storage backend. When omitted (or empty), the
            options given to the connection are used instead. See
            [https://lancedb.github.io/lancedb/guides/storage/](https://lancedb.github.io/lancedb/guides/storage/)
            for the available options.
        index_cache_size: int, optional
            Passed through to ``Table``.
        version: int, optional
            Passed through to ``Table``.
        namespace: list[str], optional
            Namespace path for the table (e.g., ["workspace"] for workspace.t).
            ``None`` is treated as the empty path ``[]``.

        Extra positional and keyword arguments are accepted but not used.
        """
        from .table import Table

        if not storage_options:
            storage_options = self._storage_options
        if namespace is None:
            namespace = []

        return Table(
            self,
            name,
            namespace=namespace,
            index_cache_size=index_cache_size,
            storage_options=storage_options,
            version=version,
        )

    def _validate_existing_table_has_stable_row_ids(self, name: str) -> None:
        """Check that an already-existing table was created with stable row IDs.

        Used by ``create_table`` when ``exist_ok=True`` and stable row IDs were
        requested. Nothing happens if the table is not listed by the
        underlying connection. An empty table only produces a warning because
        there are no fragments to inspect. Any non-``ValueError`` failure
        during the check is logged at debug level and otherwise ignored.

        Raises:
            ValueError: If the table has data but no stable row IDs.
        """
        backend = self._connect
        # Go through the lancedb table API (not lance.dataset) so namespaces work.
        # NOTE(review): table_names() is called without an explicit limit;
        # confirm the backend returns every table rather than a single page.
        if name not in backend.table_names():
            return

        try:
            table = backend.open_table(name)
            fragments = list(table.to_lance().get_fragments())  # type: ignore[union-attr]
            if not fragments:
                _LOG.warning(
                    f"Table '{name}' exists but is empty - cannot verify "
                    f"if stable row IDs are enabled."
                )
            elif not has_stable_row_ids(fragments):
                raise ValueError(
                    f"Cannot open table '{name}' with exist_ok=True: "
                    f"table exists but does not have stable row IDs enabled.\n\n"
                    f"You requested stable row IDs, but the existing table "
                    f"was created without them.\n\n"
                    f"Options:\n"
                    f"  1. Drop and recreate the table with stable row IDs\n"
                    f"  2. Remove storage_options if stable row IDs are "
                    f"not required\n"
                    f"  3. Use mode='overwrite' to replace the table"
                )
        except ValueError:
            # The validation failure itself must reach the caller.
            raise
        except Exception as e:
            # Best-effort check: inability to inspect the table is not fatal.
            _LOG.debug(
                f"Could not check stable row ID status for existing table '{name}': {e}"
            )

    @override
    def create_table(  # type: ignore
        self,
        name: str,
        data: DATA | None = None,
        schema: pa.Schema | LanceModel | None = None,
        mode: str = "create",
        exist_ok: bool = False,
        on_bad_vectors: str = "error",
        fill_value: float = 0.0,
        storage_options: dict[str, str] | None = None,
        *args,
        **kwargs,
    ) -> "Table":  # type: ignore
        """Create a table through the underlying connection and wrap it.

        Parameters
        ----------
        name: str
            Name of the table to create.
        data: optional
            Initial contents of the table. At least one of `data` or `schema`
            must be supplied. Accepted forms:

            - list-of-dict
            - pandas.DataFrame
            - pyarrow.Table or pyarrow.RecordBatch
        schema: optional
            Schema of the table. Accepted forms:

            - pyarrow.Schema
            - [LanceModel][lancedb.pydantic.LanceModel]
        mode: str, default "create"
            Either "create" or "overwrite". With "create", an already existing
            table is an error; "overwrite" replaces it.
        exist_ok: bool, default False
            With False, an existing table of the same name raises. With True,
            the existing table is opened instead: the given data is not added,
            but a given schema is validated against it.
        on_bad_vectors: str, default "error"
            Handling of vectors with mismatched size or NaNs: one of "error",
            "drop", "fill".
        fill_value: float, default 0.0
            Forwarded unchanged to the underlying ``create_table`` call.
        storage_options: dict[str, str], optional
            Forwarded to the underlying ``create_table`` call and to the
            returned ``Table``. If ``new_table_enable_stable_row_ids`` is
            ``"true"``, ``True`` or ``1``, it is normalised to ``"true"`` on a
            copy of the dict (the caller's dict is left untouched).
        **kwargs
            Forwarded to the underlying ``create_table`` call; a ``namespace``
            entry, if present, is also handed to the returned ``Table``.

        Raises
        ------
        ValueError
            If stable row IDs are requested with ``exist_ok=True`` and the
            existing table has data but no stable row IDs.
        """
        from .table import Table

        stable_key = "new_table_enable_stable_row_ids"
        # Both the string "true" and the boolean True are accepted.
        if storage_options and storage_options.get(stable_key) in ("true", True, 1):
            # lancedb expects the string form; work on a copy.
            storage_options = {**storage_options, stable_key: "true"}

            # With exist_ok the existing table must already match the request.
            if exist_ok:
                self._validate_existing_table_has_stable_row_ids(name)

        backend = self._connect
        backend.create_table(
            name,
            data,
            schema,
            mode,
            *args,
            exist_ok=exist_ok,
            on_bad_vectors=on_bad_vectors,
            fill_value=fill_value,
            storage_options=storage_options,
            **kwargs,
        )

        # The Table wrapper needs the same namespace the table was created in.
        return Table(
            self,
            name,
            namespace=kwargs.get("namespace"),
            storage_options=storage_options,
        )

    def create_view(
        self,
        name: str,
        query: str,
        materialized: bool = False,
    ) -> "Table":
        """Create a View from a Query.

        Parameters
        ----------
        name: str
            Name of the view.
        query: str
            SQL query to create the view.
        materialized: bool, optional
            Must be False. Materialized views cannot be created from a SQL
            string yet.

        Returns
        -------
        Table
            The newly created view, opened via ``open_table``.

        Raises
        ------
        NotImplementedError
            If ``materialized`` is True.
        """
        if materialized:
            # idea, rename the provided name, and use it as the basis for the
            # materialized view.
            # - how do we add the udfs to the final materialized view table?
            # The exception has to be raised: merely constructing it would let
            # the call fall through and create a plain view instead.
            raise NotImplementedError(
                "creating materialized view via sql query is not supported yet."
            )

        # TODO add test coverage here
        # NOTE: `name` and `query` are interpolated verbatim into the
        # statement, so they must not come from untrusted input.
        self.sql(f"CREATE VIEW {name} AS ({query})")
        return self.open_table(name)

    def create_materialized_view(
        self,
        name: str,
        query: "GenevaQueryBuilder",
        with_no_data: bool = True,
    ) -> "Table":
        """
        Create a materialized view from a query builder.

        Parameters
        ----------
        name: str
            Name of the materialized view.
        query: GenevaQueryBuilder
            Query that defines the view.
        with_no_data: bool, optional
            If True (the default), the view is only created. If False and the
            created table exposes ``refresh_view``, it is refreshed right
            after creation.

        Raises
        ------
        ValueError
            If ``query`` is not a ``GenevaQueryBuilder``.
        """
        from geneva.query import GenevaQueryBuilder

        if not isinstance(query, GenevaQueryBuilder):
            raise ValueError(
                "Materialized views only support plain queries (where, select)"
            )

        view = query.create_materialized_view(self, name)

        populate_now = not with_no_data
        if populate_now and hasattr(view, "refresh_view"):
            view.refresh_view(name)  # type: ignore[attr-defined]

        return view

    def drop_view(self, name: str) -> pa.Table:
        """Drop the view called ``name`` and return the result of the statement."""
        statement = f"DROP VIEW {name}"
        return self.sql(statement)

    @override
    def drop_table(self, name: str, *args, **kwargs) -> None:
        """Drop the table ``name`` via the underlying connection.

        Extra positional and keyword arguments are forwarded unchanged.
        """
        backend = self._connect
        backend.drop_table(name, *args, **kwargs)

    def define_cluster(self, name: str, cluster: "GenevaCluster") -> None:  # noqa: F821
        """
        Persist a Geneva cluster definition, upserting it by name. The stored
        cluster can then be provisioned using `context(cluster=name)`.

        Parameters
        ----------
        name: str
            Name of the cluster; used as the key for upserting and
            provisioning. It is written to ``cluster.name`` before validation.
            The name must comply with RFC 1123 naming rules
            (NOTE(review): confirm the exact rule enforced by
            ``cluster.validate()``).
        cluster: GenevaCluster
            The cluster definition to store. Its ``name`` attribute is
            overwritten with ``name``.
        """
        from geneva.cluster.mgr import ClusterConfigManager

        manager = self._cluster_manager
        if manager is None:
            manager = ClusterConfigManager(self, namespace=self.system_namespace)
            self._cluster_manager = manager

        cluster.name = name
        cluster.validate()
        manager.upsert(cluster)

    def list_clusters(self) -> list["GenevaCluster"]:  # noqa: F821
        """
        Return the stored cluster definitions (see `define_cluster()`).

        Returns
        -------
        Iterable of GenevaCluster
            The Geneva cluster definitions known to the cluster manager.
        """
        from geneva.cluster.mgr import ClusterConfigManager

        manager = self._cluster_manager
        if manager is None:
            manager = ClusterConfigManager(self, namespace=self.system_namespace)
            self._cluster_manager = manager
        return manager.list()

    def delete_cluster(self, name: str) -> None:  # noqa: F821
        """
        Remove a stored Geneva cluster definition.

        Parameters
        ----------
        name: str
            Name of the cluster definition to delete.
        """
        from geneva.cluster.mgr import ClusterConfigManager

        manager = self._cluster_manager
        if manager is None:
            manager = ClusterConfigManager(self, namespace=self.system_namespace)
            self._cluster_manager = manager

        manager.delete(name)

    def define_manifest(
        self,
        name: str,
        manifest: "GenevaManifest",  # noqa: F821
        uploader: Uploader | None = None,
    ) -> None:
        """
        Persist a Geneva Manifest describing the files and dependencies of the
        execution environment. The local environment is uploaded and the
        manifest definition is upserted by name, after which it can be used
        with `context(manifest=name)`.

        Parameters
        ----------
        name: str
            Name of the manifest; used as the key for upserting and loading.
        manifest: GenevaManifest
            The manifest definition to use. It is deep-copied before being
            stored, so the caller's object is not modified.
        uploader: Uploader, optional
            Custom Uploader to use. When omitted, one is created for the
            manifest table: namespace-based if both ``namespace_impl`` and
            ``namespace_properties`` are set, otherwise based on the
            database URI.
        """
        from geneva.manifest.mgr import ManifestConfigManager

        manager = self._manifest_manager
        if manager is None:
            manager = ManifestConfigManager(self, namespace=self.system_namespace)
            self._manifest_manager = manager

        # Touch the manifest table first so that it exists (and its location
        # can be resolved) before any Uploader is built.
        manager.get_table()

        if uploader is None:
            from geneva.manifest.mgr import MANIFEST_TABLE_NAME

            if self.namespace_impl and self.namespace_properties:
                # Namespace table: the id is the system namespace path plus
                # the manifest table name.
                uploader = Uploader(
                    namespace_impl=self.namespace_impl,
                    namespace_properties=self.namespace_properties,
                    table_id=self.system_namespace + [MANIFEST_TABLE_NAME],
                )
            else:
                # Local database: the id is just the manifest table name.
                uploader = Uploader(
                    db_uri=self.uri,
                    table_id=[MANIFEST_TABLE_NAME],
                )

        with upload_local_env(
            # todo: implement excludes
            uploader=uploader,
            zip_output_dir=manifest.local_zip_output_dir,
            delete_local_zips=manifest.delete_local_zips,
            skip_site_packages=manifest.skip_site_packages,
        ) as zips:
            stored = copy.deepcopy(manifest)
            stored.name = name
            stored.zips = zips
            # The checksum is computed from the caller's manifest, not the copy.
            stored.checksum = manifest.compute_checksum()
            manager.upsert(stored)

    def list_manifests(self) -> list["GenevaManifest"]:  # noqa: F821
        """
        Return the stored manifest definitions (see `define_manifest()`).

        Returns
        -------
        Iterable of GenevaManifest
            The Geneva manifest definitions known to the manifest manager.
        """
        from geneva.manifest.mgr import ManifestConfigManager

        manager = self._manifest_manager
        if manager is None:
            manager = ManifestConfigManager(self, namespace=self.system_namespace)
            self._manifest_manager = manager
        return manager.list()

    def delete_manifest(self, name: str) -> None:  # noqa: F821
        """
        Remove a stored Geneva manifest definition.

        Parameters
        ----------
        name: str
            Name of the manifest definition to delete.
        """
        from geneva.manifest.mgr import ManifestConfigManager

        manager = self._manifest_manager
        if manager is None:
            manager = ManifestConfigManager(self, namespace=self.system_namespace)
            self._manifest_manager = manager

        manager.delete(name)

    def context(
        self,
        cluster: str | None = None,
        manifest: str | None = None,
        cluster_type=GenevaClusterType.KUBE_RAY,
        on_exit=None,
        log_to_driver: bool = True,
        logging_level=logging.INFO,
    ) -> contextlib.AbstractContextManager[None]:
        """Return a context manager for a Geneva execution environment.

        For ``LOCAL_RAY`` a local Ray cluster is started. Otherwise the
        persisted cluster definition (and, optionally, manifest) is loaded
        and used to provision the cluster.

        Parameters
        ----------
        cluster: str
            Name of a cluster definition stored via `define_cluster()`.
            Required unless `cluster_type` is `GenevaClusterType.LOCAL_RAY`,
            in which case it is ignored.
        manifest: str
            Name of a manifest stored via `define_manifest()`. Not supported
            together with `GenevaClusterType.LOCAL_RAY`.
        cluster_type: GenevaClusterType, optional, default GenevaClusterType.KUBE_RAY
            Kind of cluster to use. Pass `GenevaClusterType.LOCAL_RAY` to
            start a local Ray cluster.
        on_exit: ExitMode, optional, default ExitMode.DELETE
            What happens to the cluster when the context exits. A falsy value
            selects `ExitMode.DELETE`; other modes include
            `ExitMode.DELETE_ON_SUCCESS` and `ExitMode.RETAIN`.
        log_to_driver: bool, optional, default True
            Whether Ray worker logs are sent to the driver.
        logging_level: int, optional, default logging.INFO
            Logging level for Ray workers, e.g. logging.DEBUG.

        Raises
        ------
        ValueError
            If a manifest is given with LOCAL_RAY, if `cluster` is missing for
            a non-local cluster, or if the stored definition's cluster type
            is not supported.
        Exception
            If the named cluster or manifest definition does not exist.
        """
        from geneva.cluster.mgr import ClusterConfigManager
        from geneva.manifest.mgr import ManifestConfigManager
        from geneva.runners.ray._mgr import ray_cluster
        from geneva.runners.ray.raycluster import ExitMode

        # Both managers are set up first, regardless of the cluster type.
        if self._cluster_manager is None:
            self._cluster_manager = ClusterConfigManager(
                self, namespace=self.system_namespace
            )
        if self._manifest_manager is None:
            self._manifest_manager = ManifestConfigManager(
                self, namespace=self.system_namespace
            )

        # Local Ray needs no stored definitions.
        is_local = GenevaClusterType(cluster_type) == GenevaClusterType.LOCAL_RAY
        if is_local:
            if manifest is not None:
                raise ValueError("custom manifest not supported with LOCAL_RAY")
            return ray_cluster(
                local=True, log_to_driver=log_to_driver, logging_level=logging_level
            )

        if cluster is None:
            raise ValueError("cluster name is required for non-LOCAL_RAY cluster types")

        definition = self._cluster_manager.load(cluster)
        if definition is None:
            raise Exception(
                f"cluster definition '{cluster}' not found. "
                f"Create a new cluster with define_cluster()"
            )

        supported_types = {GenevaClusterType.KUBE_RAY}
        if GenevaClusterType(definition.cluster_type) not in supported_types:
            raise ValueError(
                f"cluster_type must be one of {supported_types} to use context()"
            )

        kuberay_settings = definition.as_dict()["kuberay"]
        use_portforwarding = kuberay_settings.get("use_portforwarding", True)

        ray_spec = definition.to_ray_cluster()
        ray_spec.on_exit = on_exit or ExitMode.DELETE

        manifest_def = None
        if manifest is not None:
            manifest_def = self._manifest_manager.load(manifest)
            if manifest_def is None:
                raise Exception(
                    f"manifest definition '{manifest}' not found. "
                    f"Create a new manifest with define_manifest()"
                )

            # Images named explicitly in the manifest win over the cluster's.
            img = manifest_def.head_image
            if img:
                _LOG.debug(f"overriding cluster head image from manifest: {img}")
                ray_spec.head_group.image = img
            img = manifest_def.worker_image
            if img:
                _LOG.debug(f"overriding cluster worker image from manifest: {img}")
                for group in ray_spec.worker_groups:
                    group.image = img

        ray_env = {
            "RAY_BACKEND_LOG_LEVEL": "info",
            "RAY_LOG_TO_DRIVER": "1",
            "RAY_RUNTIME_ENV_LOG_TO_DRIVER_ENABLED": "true",
        }
        return ray_cluster(
            use_portforwarding=use_portforwarding,
            ray_cluster=ray_spec,
            manifest=manifest_def,
            extra_env=ray_env,
            log_to_driver=log_to_driver,
            logging_level=logging_level,
        )

    def sql(self, query: str) -> pa.Table:
        """Run a raw SQL query through the Flight SQL client.

        Only the first endpoint of the returned flight info is read.

        Parameters
        ----------
        query: str
            SQL query to execute

        Returns
        -------
        pyarrow.Table
            The query result, fully read into a `pyarrow.Table`

        TODO
        ----
        - Support pagination
        - Support query parameters
        """
        client = self.flight_client
        info = client.execute(query)
        ticket = info.endpoints[0].ticket
        return client.do_get(ticket).read_all()

namespace_impl

namespace_impl = namespace_impl

namespace_properties

namespace_properties = namespace_properties

system_namespace

system_namespace = (
    system_namespace if namespace_impl else []
)

namespace_client

namespace_client: LanceNamespace | None

Returns namespace client if using namespace connection.

flight_client

flight_client: FlightSQLClient

close

close() -> None

Close the connection.

Source code in geneva/db.py

def close(self) -> None:
    """Close the Flight SQL client and the lancedb connection, if present."""
    flight = self._flight_client
    if flight is not None:
        flight.close()

    ldb = self._ldb
    if ldb is None or not hasattr(ldb, "_conn"):
        return
    # Reach into the wrapped client so the connection is closed eagerly.
    ldb._conn.close()  # type: ignore[attr-defined]

table_names

table_names(
    page_token: str | None = None,
    limit: int | None = None,
    *args,
    **kwargs,
) -> Iterable[str]

List all available tables and views.

Source code in geneva/db.py

@override
def table_names(
    self, page_token: str | None = None, limit: int | None = None, *args, **kwargs
) -> Iterable[str]:
    """List all available tables and views."""
    return self._connect.table_names(
        *args, page_token=page_token, limit=limit or 10, **kwargs
    )

open_table

open_table(
    name: str,
    storage_options: dict[str, str] | None = None,
    index_cache_size: int | None = None,
    version: int | None = None,
    namespace: list[str] | None = None,
    *args,
    **kwargs,
) -> Table

Open a Lance Table.

Parameters:

name (str) –

Name of the table.
storage_options (dict[str, str] | None, default: None ) –

Additional options for the storage backend. Options already set on the connection will be inherited by the table, but can be overridden here. See available options at https://lancedb.github.io/lancedb/guides/storage/
namespace (list[str] | None, default: None ) –

Namespace path for the table (e.g., ["workspace"] for workspace.t)

Source code in geneva/db.py

@override
def open_table(
    self,
    name: str,
    storage_options: dict[str, str] | None = None,
    index_cache_size: int | None = None,
    version: int | None = None,
    namespace: list[str] | None = None,
    *args,
    **kwargs,
) -> "Table":
    """Return a Geneva ``Table`` bound to this connection.

    Parameters
    ----------
    name: str
        Name of the table.
    storage_options: dict[str, str], optional
        Options for the storage backend. When omitted (or empty), the
        options given to the connection are used instead. See
        [https://lancedb.github.io/lancedb/guides/storage/](https://lancedb.github.io/lancedb/guides/storage/)
        for the available options.
    index_cache_size: int, optional
        Passed through to ``Table``.
    version: int, optional
        Passed through to ``Table``.
    namespace: list[str], optional
        Namespace path for the table (e.g., ["workspace"] for workspace.t).
        ``None`` is treated as the empty path ``[]``.

    Extra positional and keyword arguments are accepted but not used.
    """
    from .table import Table

    if not storage_options:
        storage_options = self._storage_options
    if namespace is None:
        namespace = []

    return Table(
        self,
        name,
        namespace=namespace,
        index_cache_size=index_cache_size,
        storage_options=storage_options,
        version=version,
    )

create_table

create_table(
    name: str,
    data: DATA | None = None,
    schema: Schema | LanceModel | None = None,
    mode: str = "create",
    exist_ok: bool = False,
    on_bad_vectors: str = "error",
    fill_value: float = 0.0,
    storage_options: dict[str, str] | None = None,
    *args,
    **kwargs,
) -> Table

Create a Table in the lake

Parameters:

name (str) –

The name of the table
data (DATA | None, default: None ) –
User must provide at least one of data or schema. Acceptable types are:
- list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
schema (Schema | LanceModel | None, default: None ) –
Acceptable types are:
- pyarrow.Schema
- [LanceModel][lancedb.pydantic.LanceModel]
mode (str, default: 'create' ) –

The mode to use when creating the table. Can be either "create" or "overwrite". By default, if the table already exists, an exception is raised. If you want to overwrite the table, use mode="overwrite".
exist_ok (bool, default: False ) –

If a table by the same name already exists, then raise an exception if exist_ok=False. If exist_ok=True, then open the existing table; it will not add the provided data but will validate against any schema that's specified.
on_bad_vectors (str, default: 'error' ) –

What to do if any of the vectors are not the same size or contain NaNs. One of "error", "drop", "fill".

Source code in geneva/db.py

@override
def create_table(  # type: ignore
    self,
    name: str,
    data: DATA | None = None,
    schema: pa.Schema | LanceModel | None = None,
    mode: str = "create",
    exist_ok: bool = False,
    on_bad_vectors: str = "error",
    fill_value: float = 0.0,
    storage_options: dict[str, str] | None = None,
    *args,
    **kwargs,
) -> "Table":  # type: ignore
    """Create a table through the underlying connection and wrap it.

    Parameters
    ----------
    name: str
        Name of the table to create.
    data: optional
        Initial contents of the table. At least one of `data` or `schema`
        must be supplied. Accepted forms:

        - list-of-dict
        - pandas.DataFrame
        - pyarrow.Table or pyarrow.RecordBatch
    schema: optional
        Schema of the table. Accepted forms:

        - pyarrow.Schema
        - [LanceModel][lancedb.pydantic.LanceModel]
    mode: str, default "create"
        Either "create" or "overwrite". With "create", an already existing
        table is an error; "overwrite" replaces it.
    exist_ok: bool, default False
        With False, an existing table of the same name raises. With True,
        the existing table is opened instead: the given data is not added,
        but a given schema is validated against it.
    on_bad_vectors: str, default "error"
        Handling of vectors with mismatched size or NaNs: one of "error",
        "drop", "fill".
    fill_value: float, default 0.0
        Forwarded unchanged to the underlying ``create_table`` call.
    storage_options: dict[str, str], optional
        Forwarded to the underlying ``create_table`` call and to the
        returned ``Table``. If ``new_table_enable_stable_row_ids`` is
        ``"true"``, ``True`` or ``1``, it is normalised to ``"true"`` on a
        copy of the dict (the caller's dict is left untouched).
    **kwargs
        Forwarded to the underlying ``create_table`` call; a ``namespace``
        entry, if present, is also handed to the returned ``Table``.
    """
    from .table import Table

    stable_key = "new_table_enable_stable_row_ids"
    # Both the string "true" and the boolean True are accepted.
    if storage_options and storage_options.get(stable_key) in ("true", True, 1):
        # lancedb expects the string form; work on a copy.
        storage_options = {**storage_options, stable_key: "true"}

        # With exist_ok the existing table must already match the request.
        if exist_ok:
            self._validate_existing_table_has_stable_row_ids(name)

    backend = self._connect
    backend.create_table(
        name,
        data,
        schema,
        mode,
        *args,
        exist_ok=exist_ok,
        on_bad_vectors=on_bad_vectors,
        fill_value=fill_value,
        storage_options=storage_options,
        **kwargs,
    )

    # The Table wrapper needs the same namespace the table was created in.
    return Table(
        self,
        name,
        namespace=kwargs.get("namespace"),
        storage_options=storage_options,
    )

create_view

create_view(
    name: str, query: str, materialized: bool = False
) -> Table

Create a View from a Query.

Parameters:

name (str) –

Name of the view.
query (str) –

SQL query to create the view.
materialized (bool, default: False ) –

If True, the view is materialized.

Source code in geneva/db.py

def create_view(
    self,
    name: str,
    query: str,
    materialized: bool = False,
) -> "Table":
    """Create a View from a Query.

    Parameters
    ----------
    name: str
        Name of the view.
    query: str
        SQL query to create the view.
    materialized: bool, optional
        If True, a materialized view is requested. This is not supported
        through a SQL query yet and raises ``NotImplementedError``.

    Returns
    -------
    Table
        The view, opened with ``open_table(name)`` after it was created.

    Raises
    ------
    NotImplementedError
        If ``materialized`` is True.
    """
    if materialized:
        # idea, rename the provided name, and use it as the basis for the
        # materialized view.
        # - how do we add the udfs to the final materialized view table?
        #
        # The exception has to be raised: merely constructing it would let
        # the call fall through and silently create a plain view instead.
        raise NotImplementedError(
            "creating materialized view via sql query is not supported yet."
        )

    # NOTE: `name` and `query` are interpolated verbatim into the statement,
    # so only trusted values should be passed here.
    # TODO add test coverage here
    self.sql(f"CREATE VIEW {name} AS ({query})")
    return self.open_table(name)

create_materialized_view

create_materialized_view(
    name: str,
    query: GenevaQueryBuilder,
    with_no_data: bool = True,
) -> Table

Create a materialized view

Parameters:

name (str) –

Name of the materialized view.
query (GenevaQueryBuilder) –

Query to create the view.
with_no_data (bool, default: True ) –

If True, the view is created without being populated and is left ready for a later refresh; if False, the view is refreshed immediately after it is created.

Source code in geneva/db.py

def create_materialized_view(
    self,
    name: str,
    query: "GenevaQueryBuilder",
    with_no_data: bool = True,
) -> "Table":
    """
    Create a materialized view from a query builder.

    Parameters
    ----------
    name: str
        Name of the materialized view.
    query: GenevaQueryBuilder
        Query that defines the view.
    with_no_data: bool, optional
        If True (the default), the view is only created and left to be
        refreshed later. If False, ``refresh_view`` is called on the new
        table right after creation, provided the table has that method.

    Returns
    -------
    Table
        The object returned by ``query.create_materialized_view``.

    Raises
    ------
    ValueError
        If ``query`` is not a ``GenevaQueryBuilder``.
    """
    from geneva.query import GenevaQueryBuilder

    if isinstance(query, GenevaQueryBuilder):
        view = query.create_materialized_view(self, name)
        populate_now = not with_no_data
        # Tables without a `refresh_view` method are returned unpopulated.
        if populate_now and hasattr(view, "refresh_view"):
            view.refresh_view(name)  # type: ignore[attr-defined]
        return view

    raise ValueError(
        "Materialized views only support plain queries (where, select)"
    )

drop_view

drop_view(name: str) -> Table

Drop a view.

Source code in geneva/db.py

def drop_view(self, name: str) -> pa.Table:
    """Drop the view called ``name``.

    Issues a ``DROP VIEW`` statement through ``self.sql`` and returns
    whatever that call returns. ``name`` is interpolated verbatim into the
    statement.
    """
    statement = f"DROP VIEW {name}"
    return self.sql(statement)

drop_table

drop_table(name: str, *args, **kwargs) -> None

Drop a table.

Source code in geneva/db.py

@override
def drop_table(self, name: str, *args, **kwargs) -> None:
    """Drop the table called ``name``.

    Any extra positional and keyword arguments are forwarded unchanged to
    the ``drop_table`` method of the underlying ``self._connect`` object.
    """
    backend = self._connect
    backend.drop_table(name, *args, **kwargs)

define_cluster

define_cluster(name: str, cluster: GenevaCluster) -> None

Define a persistent Geneva cluster. This will upsert the cluster definition by name. The cluster can then be provisioned using context(cluster=name).

Parameters:

name (str) –

Name of the cluster. This will be used as the key when upserting and provisioning the cluster. The cluster name must comply with RFC 1123.
cluster (GenevaCluster) –

The cluster definition to store.

Source code in geneva/db.py

def define_cluster(self, name: str, cluster: "GenevaCluster") -> None:  # noqa: F821
    """
    Persist a Geneva cluster definition under ``name``.

    The definition is upserted by name and can afterwards be provisioned
    with `context(cluster=name)`.

    Parameters
    ----------
    name: str
        Name of the cluster. This will be used as the key when upserting and
        provisioning the cluster. The cluster name must comply with RFC 1123.
    cluster: GenevaCluster
        The cluster definition to store. Its ``name`` attribute is
        overwritten with ``name`` before it is validated and stored.
    """
    from geneva.cluster.mgr import ClusterConfigManager

    # Build the cluster manager on first use and keep it for later calls.
    manager = self._cluster_manager
    if manager is None:
        manager = ClusterConfigManager(self, namespace=self.system_namespace)
        self._cluster_manager = manager

    cluster.name = name
    cluster.validate()
    manager.upsert(cluster)

list_clusters

list_clusters() -> list[GenevaCluster]

List the cluster definitions. These can be defined using define_cluster().

Returns:

Iterable of GenevaCluster –

List of Geneva cluster definitions

Source code in geneva/db.py

def list_clusters(self) -> list["GenevaCluster"]:  # noqa: F821
    """
    Return the stored cluster definitions (see `define_cluster()`).

    Returns
    -------
    list of GenevaCluster
        Whatever the cluster manager's ``list()`` call returns.
    """
    from geneva.cluster.mgr import ClusterConfigManager

    # Build the cluster manager on first use and keep it for later calls.
    manager = self._cluster_manager
    if manager is None:
        manager = ClusterConfigManager(self, namespace=self.system_namespace)
        self._cluster_manager = manager
    return manager.list()

delete_cluster

delete_cluster(name: str) -> None

Delete a Geneva cluster definition.

Parameters:

name (str) –

Name of the cluster to delete.

Source code in geneva/db.py

def delete_cluster(self, name: str) -> None:  # noqa: F821
    """
    Remove a stored Geneva cluster definition.

    Parameters
    ----------
    name: str
        Name of the cluster to delete.
    """
    from geneva.cluster.mgr import ClusterConfigManager

    # Build the cluster manager on first use and keep it for later calls.
    manager = self._cluster_manager
    if manager is None:
        manager = ClusterConfigManager(self, namespace=self.system_namespace)
        self._cluster_manager = manager

    manager.delete(name)

define_manifest

define_manifest(
    name: str,
    manifest: GenevaManifest,
    uploader: Uploader | None = None,
) -> None

Define a persistent Geneva Manifest that represents the files and dependencies used in the execution environment. This will upsert the manifest definition by name and upload the required artifacts. The manifest can then be used with context(manifest=name).

Parameters:

name (str) –

Name of the manifest. This will be used as the key when upserting and loading the manifest.
manifest (GenevaManifest) –

The manifest definition to use.
uploader (Uploader | None, default: None ) –

An optional, custom Uploader to use. If not provided, the uploader will be auto-detected based on the environment configuration.

Source code in geneva/db.py

def define_manifest(
    self,
    name: str,
    manifest: "GenevaManifest",  # noqa: F821
    uploader: Uploader | None = None,
) -> None:
    """
    Persist a Geneva Manifest describing the files and dependencies of the
    execution environment.

    The local environment is uploaded, and a copy of the manifest is upserted
    under ``name``. The manifest can then be used with
    `context(manifest=name)`.

    Parameters
    ----------
    name: str
        Name of the manifest. This will be used as the key when upserting and
        loading the manifest.
    manifest: GenevaManifest
        The manifest definition to use. It is deep-copied before the name,
        zips and checksum are set, so the stored record is a separate object.
    uploader: Uploader, optional
        An optional, custom Uploader to use. If not provided, one is built
        for the manifest table: namespace-based when both ``namespace_impl``
        and ``namespace_properties`` are set on this connection, otherwise
        based on the database URI.
    """
    from geneva.manifest.mgr import ManifestConfigManager

    # Build the manifest manager on first use and keep it for later calls.
    manager = self._manifest_manager
    if manager is None:
        manager = ManifestConfigManager(self, namespace=self.system_namespace)
        self._manifest_manager = manager

    # Ensure the manifest table exists before creating Uploader
    # This guarantees the table location can be queried
    manager.get_table()

    if uploader is None:
        from geneva.manifest.mgr import MANIFEST_TABLE_NAME

        uses_namespace = self.namespace_impl and self.namespace_properties
        if uses_namespace:
            # Namespace table: the id is the system namespace path followed
            # by the manifest table name.
            manifest_table_id = self.system_namespace + [MANIFEST_TABLE_NAME]
            uploader = Uploader(
                namespace_impl=self.namespace_impl,
                namespace_properties=self.namespace_properties,
                table_id=manifest_table_id,
            )
        else:
            # Local database: the id is just the manifest table name.
            manifest_table_id = [MANIFEST_TABLE_NAME]
            uploader = Uploader(
                db_uri=self.uri,
                table_id=manifest_table_id,
            )

    upload_ctx = upload_local_env(
        # todo: implement excludes
        uploader=uploader,
        zip_output_dir=manifest.local_zip_output_dir,
        delete_local_zips=manifest.delete_local_zips,
        skip_site_packages=manifest.skip_site_packages,
    )
    with upload_ctx as uploaded_zips:
        stored = copy.deepcopy(manifest)
        stored.name = name
        stored.zips = uploaded_zips
        # The checksum is taken from the caller's manifest, not the copy.
        stored.checksum = manifest.compute_checksum()
        manager.upsert(stored)

list_manifests

list_manifests() -> list[GenevaManifest]

List the manifest definitions. These can be defined using define_manifest().

Returns:

Iterable of GenevaManifest –

List of Geneva manifest definitions

Source code in geneva/db.py

def list_manifests(self) -> list["GenevaManifest"]:  # noqa: F821
    """
    Return the stored manifest definitions (see `define_manifest()`).

    Returns
    -------
    list of GenevaManifest
        Whatever the manifest manager's ``list()`` call returns.
    """
    from geneva.manifest.mgr import ManifestConfigManager

    # Build the manifest manager on first use and keep it for later calls.
    manager = self._manifest_manager
    if manager is None:
        manager = ManifestConfigManager(self, namespace=self.system_namespace)
        self._manifest_manager = manager
    return manager.list()

delete_manifest

delete_manifest(name: str) -> None

Delete a Geneva manifest definition.

Parameters:

name (str) –

Name of the manifest to delete.

Source code in geneva/db.py

def delete_manifest(self, name: str) -> None:  # noqa: F821
    """
    Remove a stored Geneva manifest definition.

    Parameters
    ----------
    name: str
        Name of the manifest to delete.
    """
    from geneva.manifest.mgr import ManifestConfigManager

    # Build the manifest manager on first use and keep it for later calls.
    manager = self._manifest_manager
    if manager is None:
        manager = ManifestConfigManager(self, namespace=self.system_namespace)
        self._manifest_manager = manager

    manager.delete(name)

context

context(
    cluster: str | None = None,
    manifest: str | None = None,
    cluster_type=KUBE_RAY,
    on_exit=None,
    log_to_driver: bool = True,
    logging_level=INFO,
) -> AbstractContextManager[None]

Context manager for a Geneva Execution Environment. This will provision a cluster based on the cluster definition and the manifest provided. By default, the context manager will delete the cluster on exit. This can be configured with the on_exit parameter.

Parameters:

cluster (str | None, default: None ) –

Name of the persisted cluster definition to use. This will raise an exception if the cluster definition was not defined via define_cluster(). This parameter is ignored if cluster_type is GenevaClusterType.LOCAL_RAY.
manifest (str | None, default: None ) –

Name of the persisted manifest to use. This will raise an exception if the manifest definition was not defined via define_manifest().
cluster_type –

Type of the cluster to use. By default, KUBE_RAY will be used. To start a local Ray cluster, use GenevaClusterType.LOCAL_RAY.
on_exit –

Exit mode for the cluster. By default, the cluster will be deleted when the context manager exits. To retain the cluster on errors, use ExitMode.DELETE_ON_SUCCESS. To always retain the cluster, use ExitMode.RETAIN.
log_to_driver (bool, default: True ) –

Whether to send Ray worker logs to the driver. Defaults to True for better visibility in tests and debugging.
logging_level –

The logging level for Ray workers. Use logging.DEBUG for detailed logs.

Source code in geneva/db.py

def context(
    self,
    cluster: str | None = None,
    manifest: str | None = None,
    cluster_type=GenevaClusterType.KUBE_RAY,
    on_exit=None,
    log_to_driver: bool = True,
    logging_level=logging.INFO,
) -> contextlib.AbstractContextManager[None]:
    """Context manager for a Geneva Execution Environment.

    This will provision a cluster based on the cluster definition and the
    manifest provided. By default, the context manager will delete the
    cluster on exit. This can be configured with the on_exit parameter.

    Parameters
    ----------
    cluster: str
        Name of the persisted cluster definition to use. This will
        raise an exception if the cluster definition was not
        defined via `define_cluster()`. This parameter is ignored
        if `cluster_type` is `GenevaClusterType.LOCAL_RAY`.
    manifest: str
        Name of the persisted manifest to use. This will
        raise an exception if the manifest definition was not
        defined via `define_manifest()`. Not supported together with
        `GenevaClusterType.LOCAL_RAY`.
    cluster_type: GenevaClusterType, optional, default GenevaClusterType.KUBE_RAY
        Type of the cluster to use. By default, KUBE_RAY will be used.
        To start a local Ray cluster, use `GenevaClusterType.LOCAL_RAY`.
    on_exit: ExitMode, optional, default ExitMode.DELETE
        Exit mode for the cluster. By default, the cluster will be deleted when the
        context manager exits.
        To retain the cluster on errors, use `ExitMode.DELETE_ON_SUCCESS`.
        To always retain the cluster, use `ExitMode.RETAIN`.
    log_to_driver: bool, optional, default True
        Whether to send Ray worker logs to the driver. Defaults to True for
        better visibility in tests and debugging.
    logging_level: int, optional, default logging.INFO
        The logging level for Ray workers. Use logging.DEBUG for detailed logs.

    Raises
    ------
    ValueError
        If a manifest is given with LOCAL_RAY, if ``cluster`` is missing for
        a non-LOCAL_RAY cluster type, or if the stored cluster definition is
        not of a supported type.
    Exception
        If the named cluster or manifest definition cannot be found.
    """
    from geneva.cluster.mgr import ClusterConfigManager
    from geneva.manifest.mgr import ManifestConfigManager
    from geneva.runners.ray._mgr import ray_cluster
    from geneva.runners.ray.raycluster import ExitMode

    # Both managers are built on first use and kept for later calls.
    cluster_mgr = self._cluster_manager
    if cluster_mgr is None:
        cluster_mgr = ClusterConfigManager(self, namespace=self.system_namespace)
        self._cluster_manager = cluster_mgr
    manifest_mgr = self._manifest_manager
    if manifest_mgr is None:
        manifest_mgr = ManifestConfigManager(self, namespace=self.system_namespace)
        self._manifest_manager = manifest_mgr

    # Local Ray needs neither a stored cluster definition nor a manifest.
    wants_local = GenevaClusterType(cluster_type) == GenevaClusterType.LOCAL_RAY
    if wants_local:
        if manifest is not None:
            raise ValueError("custom manifest not supported with LOCAL_RAY")
        return ray_cluster(
            local=True, log_to_driver=log_to_driver, logging_level=logging_level
        )

    if cluster is None:
        raise ValueError("cluster name is required for non-LOCAL_RAY cluster types")

    cluster_def = cluster_mgr.load(cluster)
    if cluster_def is None:
        not_found = (
            f"cluster definition '{cluster}' not found. "
            "Create a new cluster with define_cluster()"
        )
        raise Exception(not_found)

    supported_types = {GenevaClusterType.KUBE_RAY}
    if GenevaClusterType(cluster_def.cluster_type) not in supported_types:
        raise ValueError(
            f"cluster_type must be one of {supported_types} to use context()"
        )

    # Port forwarding is on unless the kuberay settings say otherwise.
    kuberay_settings = cluster_def.as_dict()["kuberay"]
    use_portforwarding = kuberay_settings.get("use_portforwarding", True)

    rc = cluster_def.to_ray_cluster()
    rc.on_exit = on_exit if on_exit else ExitMode.DELETE

    manifest_def = None
    if manifest is not None:
        manifest_def = manifest_mgr.load(manifest)
        if manifest_def is None:
            not_found = (
                f"manifest definition '{manifest}' not found. "
                "Create a new manifest with define_manifest()"
            )
            raise Exception(not_found)

        # image explicitly provided in manifest takes precedence
        head_img = manifest_def.head_image
        if head_img:
            _LOG.debug(f"overriding cluster head image from manifest: {head_img}")
            rc.head_group.image = head_img
        worker_img = manifest_def.worker_image
        if worker_img:
            _LOG.debug(f"overriding cluster worker image from manifest: {worker_img}")
            for worker_group in rc.worker_groups:
                worker_group.image = worker_img

    ray_env = {
        "RAY_BACKEND_LOG_LEVEL": "info",
        "RAY_LOG_TO_DRIVER": "1",
        "RAY_RUNTIME_ENV_LOG_TO_DRIVER_ENABLED": "true",
    }
    return ray_cluster(
        use_portforwarding=use_portforwarding,
        ray_cluster=rc,
        manifest=manifest_def,
        extra_env=ray_env,
        log_to_driver=log_to_driver,
        logging_level=logging_level,
    )

sql

sql(query: str) -> Table

Execute a raw SQL query.

It uses the Flight SQL engine to execute the query.

Parameters:

query (str) –

SQL query to execute

Returns:

Table –

Result of the query in a pyarrow.Table

TODO

Support pagination
Support query parameters

Source code in geneva/db.py

def sql(self, query: str) -> pa.Table:
    """Run a raw SQL query through the Flight SQL engine.

    Parameters
    ----------
    query: str
        SQL query to execute

    Returns
    -------
    pyarrow.Table
        Result of the query in a `pyarrow.Table`. Only the first endpoint
        of the returned flight info is read.

    TODO
    ----
    - Support pagination
    - Support query parameters
    """
    flight_info = self.flight_client.execute(query)
    first_ticket = flight_info.endpoints[0].ticket
    reader = self.flight_client.do_get(first_ticket)
    return reader.read_all()