Skip to content

Pandas and PyArrow

Because Lance is built on top of Apache Arrow, LanceDB is tightly integrated with the Python data ecosystem, including Pandas and PyArrow. The sequence of steps in a typical workflow is shown below.

Create dataset

First, we need to connect to a LanceDB database.

# Synchronous client: open (or create) a LanceDB database at the given path.
import lancedb

uri = "data/sample-lancedb"
db = lancedb.connect(uri)
# Asynchronous client for the same database.
# NOTE(review): top-level `await` only works in an async context
# (e.g. a Jupyter notebook or inside an `async def` function).
import lancedb

uri = "data/sample-lancedb"
async_db = await lancedb.connect_async(uri)

We can load a Pandas DataFrame to LanceDB directly.

# Create a table directly from a Pandas DataFrame (sync API).
# The "vector" column holds fixed-length embeddings; "item"/"price"
# are ordinary scalar columns.
import pandas as pd

data = pd.DataFrame(
    {
        "vector": [[3.1, 4.1], [5.9, 26.5]],
        "item": ["foo", "bar"],
        "price": [10.0, 20.0],
    }
)
table = db.create_table("pd_table", data=data)
# Same table creation via the async client (requires an async context
# for the top-level `await`).
import pandas as pd

data = pd.DataFrame(
    {
        "vector": [[3.1, 4.1], [5.9, 26.5]],
        "item": ["foo", "bar"],
        "price": [10.0, 20.0],
    }
)
await async_db.create_table("pd_table_async", data=data)

Similar to the pyarrow.write_dataset() method, LanceDB's db.create_table() accepts data in a variety of forms.

If you have a dataset that is larger than memory, you can create a table with Iterator[pyarrow.RecordBatch] to lazily load the data:

from typing import Iterable

import pyarrow as pa

def make_batches() -> Iterable[pa.RecordBatch]:
    """Lazily yield Arrow RecordBatches one at a time.

    Streaming batches like this lets LanceDB ingest datasets larger
    than memory, since no batch is materialized until it is consumed.
    """
    # The loop counter is unused (`_`); each iteration yields an
    # identical two-row batch purely for demonstration.
    for _ in range(5):
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]]),
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            ["vector", "item", "price"],
        )


# An explicit schema is required when passing an iterator: the data is
# lazy, so LanceDB cannot inspect it up front to infer column types.
schema = pa.schema(
    [
        pa.field("vector", pa.list_(pa.float32())),
        pa.field("item", pa.utf8()),
        pa.field("price", pa.float32()),
    ]
)
table = db.create_table("iterable_table", data=make_batches(), schema=schema)
from typing import Iterable

import pyarrow as pa

def make_batches() -> Iterable[pa.RecordBatch]:
    for i in range(5):
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]]),
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            ["vector", "item", "price"],
        )


schema = pa.schema(
    [
        pa.field("vector", pa.list_(pa.float32())),
        pa.field("item", pa.utf8()),
        pa.field("price", pa.float32()),
    ]
)
await async_db.create_table(
    "iterable_table_async", data=make_batches(), schema=schema
)

You will find detailed instructions for creating a LanceDB dataset in the Getting Started and API sections.

We can now perform similarity search via the LanceDB Python API.

# Open the table previously created.
table = db.open_table("pd_table")

query_vector = [100, 100]
# Pandas DataFrame
# Nearest-neighbor search, limited to the single closest row, returned
# as a DataFrame (adds a `_distance` column).
df = table.search(query_vector).limit(1).to_pandas()
print(df)
# Open the table previously created.
# Async equivalent: build the query explicitly with .query().nearest_to().
async_tbl = await async_db.open_table("pd_table_async")

query_vector = [100, 100]
# Pandas DataFrame
df = await async_tbl.query().nearest_to(query_vector).limit(1).to_pandas()
print(df)
    vector     item  price    _distance
0  [5.9, 26.5]  bar   20.0  14257.05957

If you have a simple filter, it's faster to provide a where clause to LanceDB's search method. For more complex filters or aggregations, you can always resort to using the underlying DataFrame methods after performing a search.

# Apply the filter via LanceDB
# Pushing the predicate into the search is faster than filtering the
# resulting DataFrame afterwards.
results = table.search([100, 100]).where("price < 15").to_pandas()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"

# Apply the filter via Pandas
# Fetch the unfiltered results, then filter client-side with a boolean
# mask. (The redundant chained assignment `df = results = ...` is gone:
# `results` is derived from `df` on the next line.)
df = table.search([100, 100]).to_pandas()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# Apply the filter via LanceDB
results = (
    await async_tbl.query().nearest_to([100, 100]).where("price < 15").to_pandas()
)
assert len(results) == 1
assert results["item"].iloc[0] == "foo"

# Apply the filter via Pandas
df = results = await async_tbl.query().nearest_to([100, 100]).to_pandas()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"