Data Operations in LanceDB

Once you have created a table, there are several ways to modify its data. You can:

  • Ingest and add new records to your table;
  • Update existing records that match specific conditions;
  • Use the powerful Merge Insert function for more complex operations like upserting or replacing ranges of data.

These operations allow you to keep your table data current and maintain it exactly as needed for your use case. Let's look at each of these operations in detail.

Note

These examples demonstrate common usage patterns. For complete API details and advanced options, refer to our Python and TypeScript SDK documentation.

Connecting to LanceDB

Before performing any operations, you'll need to connect to LanceDB. The connection method depends on whether you're using LanceDB Cloud or the open source version.

import lancedb

# Connect to LanceDB Cloud
db = lancedb.connect(
  uri="db://your-project-slug",
  api_key="your-api-key",
  region="us-east-1"
)

import lancedb

# Connect to local LanceDB
db = lancedb.connect("./data")  # Local directory for data storage

import * as lancedb from "@lancedb/lancedb"

// Connect to LanceDB Cloud
const db = await lancedb.connect({
  uri: "db://your-project-slug",
  apiKey: "your-api-key",
  region: "us-east-1"
});

import * as lancedb from "@lancedb/lancedb"

// Connect to local LanceDB
const db = await lancedb.connect("./data");  // Local directory for data storage

Data Insertion

Basic Data Insertion

import lancedb
import pyarrow as pa

# connect to LanceDB Cloud
db = lancedb.connect(
  uri="db://your-project-slug",
  api_key="your-api-key",
  region="us-east-1"
)

# create an empty table with schema
data = [
    {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
    {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    {"vector": [10.2, 100.8], "item": "baz", "price": 30.0},
    {"vector": [1.4, 9.5], "item": "fred", "price": 40.0},
]

schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])

table_name = "basic_ingestion_example"
table = db.create_table(table_name, schema=schema, mode="overwrite")
table.add(data)

import * as lancedb from "@lancedb/lancedb"
import { Schema, Field, Float32, FixedSizeList, Utf8 } from "apache-arrow";

const db = await lancedb.connect({
  uri: "db://your-project-slug",
  apiKey: "your-api-key",
  region: "us-east-1"
});

console.log("Creating table from JavaScript objects");
const data = [
    { vector: [3.1, 4.1], item: "foo", price: 10.0 },
    { vector: [5.9, 26.5], item: "bar", price: 20.0 },
    { vector: [10.2, 100.8], item: "baz", price: 30.0},
    { vector: [1.4, 9.5], item: "fred", price: 40.0},
]

const tableName = "js_objects_example";
const table = await db.createTable(tableName, data, {
    mode: "overwrite"
});

console.log("\nCreating a table with a predefined schema then add data to it");
const tableName = "schema_example";

// Define schema
// create an empty table with schema
const schema = new Schema([
    new Field(
    "vector",
    new FixedSizeList(2, new Field("float32", new Float32())),
    ),
    new Field("item", new Utf8()),
    new Field("price", new Float32()),
]);

// Create an empty table with schema
const table = await db.createEmptyTable(tableName, schema, {
    mode: "overwrite",
});

// Add data to the schema-defined table
const data = [
    { vector: [3.1, 4.1], item: "foo", price: 10.0 },
    { vector: [5.9, 26.5], item: "bar", price: 20.0 },
    { vector: [10.2, 100.8], item: "baz", price: 30.0},
    { vector: [1.4, 9.5], item: "fred", price: 40.0},
]

await table.add(data);

Vector Column Type

The vector column needs to be a pyarrow.FixedSizeList type.
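In pyarrow, a fixed-size list type is created by passing an explicit length to pa.list_, as the schemas above do. A quick sketch to confirm the type:

import pyarrow as pa

# Passing a length to pa.list_ yields a FixedSizeListType,
# which is what LanceDB expects for vector columns
vector_type = pa.list_(pa.float32(), 2)
print(vector_type)                               # fixed_size_list<item: float>[2]
print(pa.types.is_fixed_size_list(vector_type))  # True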

Using Pydantic Models

from lancedb.pydantic import Vector, LanceModel

# Define a Pydantic model
class Content(LanceModel):
    movie_id: int
    vector: Vector(128)
    genres: str
    title: str
    imdb_id: int

    @property
    def imdb_url(self) -> str:
        return f"https://www.imdb.com/title/tt{self.imdb_id}"

# Create table with Pydantic model schema
table_name = "pydantic_example"
table = db.create_table(table_name, schema=Content, mode="overwrite")
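Rows can then be added either as plain dicts or as model instances. A minimal sketch with made-up field values:

import random

# Add a row as a Content instance; it is validated against the schema
table.add([
    Content(
        movie_id=1,
        vector=[random.random() for _ in range(128)],
        genres="drama",
        title="Example Movie",
        imdb_id=1375666,
    )
])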

Using Nested Models

You can use nested Pydantic models to represent complex data structures. For example, you may want to store the document string and the document source name as a nested Document object:

from pydantic import BaseModel

class Document(BaseModel):
    content: str
    source: str

This can be used as the type of a LanceDB table column:

class NestedSchema(LanceModel):
    id: str
    vector: Vector(128)
    document: Document

# Create table with nested schema
table_name = "nested_model_example"
table = db.create_table(table_name, schema=NestedSchema, mode="overwrite")

This creates a struct column called document that has two subfields called content and source:

In [28]: table.schema
Out[28]:
id: string not null
vector: fixed_size_list<item: float>[128] not null
    child 0, item: float
document: struct<content: string not null, source: string not null> not null
    child 0, content: string not null
    child 1, source: string not null
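Rows for this schema supply the nested field as a dict (or a Document instance). A minimal sketch with made-up values:

table.add([
    {
        "id": "doc-1",
        "vector": [0.0] * 128,
        "document": {"content": "Hello world", "source": "greetings.txt"},
    }
])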

Batch Data Insertion

When creating a table from a large dataset, it is recommended to use an iterator to add the data in batches. Data will be automatically compacted for the best query performance.

import pyarrow as pa

def make_batches():
    for i in range(5):  # Create 5 batches
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]],
                        pa.list_(pa.float32(), 2)),
                pa.array([f"item{i*2+1}", f"item{i*2+2}"]),
                pa.array([float((i*2+1)*10), float((i*2+2)*10)]),
            ],
            ["vector", "item", "price"],
        )

schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])
# Create table with batches
table_name = "batch_ingestion_example"
table = db.create_table(table_name, make_batches(), schema=schema, mode="overwrite")
console.log("\nBatch ingestion example with product catalog data");
const tableName = "product_catalog";

// Vector dimension for product embeddings (realistic dimension for text embeddings)
const vectorDim = 128;

// Create random embedding vector of specified dimension
const createRandomEmbedding = (dim: number) => Array(dim).fill(0).map(() => Math.random() * 2 - 1);

// Create table with initial batch of products
const initialBatch = Array(10).fill(0).map((_, i) => ({
    product_id: `PROD-${1000 + i}`,
    name: `Product ${i + 1}`,
    category: ["electronics", "home", "office"][i % 3],
    price: 10.99 + (i * 5.99),
    vector: createRandomEmbedding(vectorDim)
}));

const table = await db.createTable(tableName, initialBatch, { 
    mode: "overwrite"
});

// Second batch - 25 more products
const batch2 = Array(25).fill(0).map((_, i) => ({
    product_id: `PROD-${2000 + i}`,
    name: `Premium Product ${i + 1}`,
    category: ["electronics", "kitchen", "outdoor", "office", "gaming"][i % 5],
    price: 25.99 + (i * 7.49),
    vector: createRandomEmbedding(vectorDim)
}));

await table.add(batch2);

// Third batch - 15 more products in a different category
const batch3 = Array(15).fill(0).map((_, i) => ({
    product_id: `PROD-${3000 + i}`,
    name: `Budget Product ${i + 1}`,
    category: ["essentials", "budget", "basics"][i % 3],
    price: 5.99 + (i * 2.50),
    vector: createRandomEmbedding(vectorDim)
}));

await table.add(batch3);

Data Modification

Update Operations

An update can modify anywhere from zero rows to every row in the table, depending on how many rows match the where clause. Update queries follow the form of a SQL UPDATE statement: the where parameter is a SQL filter that matches on the metadata columns, and the values or values_sql parameters provide the new values for the columns.

  • where (str): The SQL where clause to use when updating rows. For example, 'x = 2' or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error.
  • values (dict): The values to update. The keys are the column names and the values are the values to set.
  • values_sql (dict): The values to update. The keys are the column names and the values are SQL expressions to set. For example, {'x': 'x + 1'} will increment the value of the x column by 1.

SQL syntax

See SQL filters for more information on the supported SQL syntax.

Warning

Updating nested columns is not yet supported.

API Reference: lancedb.table.Table.update

import lancedb

import pandas as pd

# Create a table from a pandas DataFrame
data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})

tbl = db.create_table("test_table", data, mode="overwrite")
# Update the table where x = 2
tbl.update(where="x = 2", values={"vector": [10, 10]})
# Get the updated table as a pandas DataFrame
df = tbl.to_pandas()
print(df)

import lancedb
import pandas as pd

# Connect with the async API
async_db = await lancedb.connect_async("./data")

# Create a table from a pandas DataFrame
data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})

async_tbl = await async_db.create_table("update_table_async", data)
# Update the table where x = 2
await async_tbl.update({"vector": [10, 10]}, where="x = 2")
# Get the updated table as a pandas DataFrame
df = await async_tbl.to_pandas()
# Print the DataFrame
print(df)

Output

    x  vector
0  1  [1.0, 2.0]
1  3  [5.0, 6.0]
2  2  [10.0, 10.0]

API Reference: lancedb.Table.update

import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("./.lancedb");

const data = [
    {x: 1, vector: [1, 2]},
    {x: 2, vector: [3, 4]},
    {x: 3, vector: [5, 6]},
];
const tbl = await db.createTable("my_table", data);

await tbl.update({ 
    values: { vector: [10, 10] },
    where: "x = 2"
});

Updating using a SQL query

The values parameter provides the new values for the columns as literal values. You can also use the values_sql / valuesSql parameter to provide SQL expressions for the new values. For example, you can use values_sql={"x": "x + 1"} to increment the value of the x column by 1.

# Increment x for all rows (no where clause)
tbl.update(values_sql={"x": "x + 1"})
print(tbl.to_pandas())

# Same update with the async API, which uses the updates_sql parameter
await async_tbl.update(updates_sql={"x": "x + 1"})
print(await async_tbl.to_pandas())

Output

    x  vector
0  2  [1.0, 2.0]
1  4  [5.0, 6.0]
2  3  [10.0, 10.0]

Note

When rows are updated, they are moved out of the index. The row will still show up in ANN queries, but the query will not be as fast as it would be if the row was in the index. If you update a large proportion of rows, consider rebuilding the index afterwards.
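For example, after a bulk update you might recreate the vector index; a sketch using the Python API (the metric and other index parameters depend on your configuration):

# Recreate the vector index so the updated rows are indexed again;
# replace=True overwrites the existing index on the column
table.create_index(
    metric="cosine",
    vector_column_name="vector",
    replace=True,
)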

Delete Operations

Remove rows that match a condition.

table = db.open_table("update_table_example")

# delete data
predicate = "price = 30.0"
table.delete(predicate)

table = await db.openTable("js_objects_example");

// delete data
const predicate = "price = 30.0";
await table.delete(predicate);

Permanent Deletion

Delete operations are permanent and cannot be undone. Always ensure you have backups or are certain before deleting data.
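One way to reduce the risk is to check how many rows match the predicate before deleting, for example:

predicate = "price = 30.0"

# Count the matching rows first, then delete
n = table.count_rows(predicate)
print(f"About to delete {n} rows")
table.delete(predicate)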

Merge Operations

The merge insert command is a flexible API that can be used to perform upsert, insert-if-not-exists, and replace-range operations.

Use scalar indexes to speed up merge insert

The merge insert command performs a join between the input data and the target table on the key you provide. This requires scanning that entire column, which can be expensive for large tables. To speed up this operation, create a scalar index on the join column, which will allow LanceDB to find matches without scanning the whole table.

Read more about scalar indices in the Scalar Index guide.
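For example, before upserting on an id column you could create a scalar index on it (a sketch using the Python API):

# A BTREE scalar index on the merge key lets merge_insert find
# matching rows without scanning the whole column
table.create_scalar_index("id")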

Embedding Functions

Like the create table and add APIs, the merge insert API will automatically compute embeddings if the table has an embedding definition in its schema. If the input data doesn't contain the source column, or the vector column is already filled, the embeddings won't be computed.
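For reference, such an embedding definition looks roughly like the following (a sketch using the embedding function registry; the provider choice here is illustrative):

from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Pick an embedding function from the registry
func = get_registry().get("sentence-transformers").create()

class Chunk(LanceModel):
    text: str = func.SourceField()  # embeddings are computed from this column
    vector: Vector(func.ndims()) = func.VectorField()

# table = db.create_table("chunks_with_embeddings", schema=Chunk)
# table.add([{"text": "hello"}])  # vector is filled in automatically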

Upsert

upsert updates rows if they exist and inserts them if they don't. To do this with merge insert, enable both when_matched_update_all() and when_not_matched_insert_all().

# Create example table
users_table_name = "users_example"
table = db.create_table(
    users_table_name,
    [
        {"id": 0, "name": "Alice"},
        {"id": 1, "name": "Bob"},
    ],
    mode="overwrite",
)
print(f"Created users table with {table.count_rows()} rows")

# Prepare data for upsert
new_users = [
    {"id": 1, "name": "Bobby"},  # Will update existing record
    {"id": 2, "name": "Charlie"},  # Will insert new record
]

# Upsert by id
(
    table.merge_insert("id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(new_users)
)

# Verify results - should be 3 records total
print(f"Total users: {table.count_rows()}")  # 3

// Create example table
const table = await db.createTable("users", [
  { id: 0, name: "Alice" },
  { id: 1, name: "Bob" },
]);

// Prepare data for upsert
const newUsers = [
  { id: 1, name: "Bobby" },  // Will update existing record
  { id: 2, name: "Charlie" },  // Will insert new record
];

// Upsert by id
await table
  .mergeInsert("id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute(newUsers);

// Verify results - should be 3 records total
const count = await table.countRows();
console.log(`Total users: ${count}`);  // 3

Insert-if-not-exists

This will only insert rows that do not have a match in the target table, thus preventing duplicate rows. To do this with merge insert, enable just when_not_matched_insert_all().

# Create example table
table = db.create_table(
    "domains",
    [
        {"domain": "google.com", "name": "Google"},
        {"domain": "github.com", "name": "GitHub"},
    ],
)

# Prepare new data - one existing and one new record
new_domains = [
    {"domain": "google.com", "name": "Google"},
    {"domain": "facebook.com", "name": "Facebook"},
]

# Insert only if domain doesn't exist
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)

# Verify count - should be 3 (original 2 plus 1 new)
print(f"Total domains: {table.count_rows()}")  # 3

// Create example table
const table = await db.createTable(
  "domains", 
  [
    { domain: "google.com", name: "Google" },
    { domain: "github.com", name: "GitHub" },
  ]
);

// Prepare new data - one existing and one new record
const newDomains = [
  { domain: "google.com", name: "Google" },
  { domain: "facebook.com", name: "Facebook" },
];

// Insert only if domain doesn't exist
await table.merge_insert("domain")
  .whenNotMatchedInsertAll()
  .execute(newDomains);

// Verify count - should be 3 (original 2 plus 1 new)
const count = await table.countRows();
console.log(`Total domains: ${count}`);  // 3

Replace range

You can also replace a range of rows in the target table with the input data. For example, if you have a table of document chunks, where each chunk has both a doc_id and a chunk_id, you can replace all chunks for a given doc_id with updated chunks.

This can be tricky with a plain upsert: if the new data contains fewer chunks than the old data, the leftover chunks are never removed. To avoid this, add another clause that deletes any chunks for the document that are not in the new data, using when_not_matched_by_source_delete.

# Create example table with document chunks
table = db.create_table(
    "chunks",
    [
        {"doc_id": 0, "chunk_id": 0, "text": "Hello"},
        {"doc_id": 0, "chunk_id": 1, "text": "World"},
        {"doc_id": 1, "chunk_id": 0, "text": "Foo"},
        {"doc_id": 1, "chunk_id": 1, "text": "Bar"},
        {"doc_id": 2, "chunk_id": 0, "text": "Baz"},
    ],
)

# New data - replacing all chunks for doc_id 1 with just one chunk
new_chunks = [
    {"doc_id": 1, "chunk_id": 0, "text": "Zoo"},
]

# Replace all chunks for doc_id 1
(
    table.merge_insert(["doc_id", "chunk_id"])
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .when_not_matched_by_source_delete("doc_id = 1")
    .execute(new_chunks)
)

# Verify count for doc_id = 1 - should be 1
print(f"Chunks for doc_id = 1: {table.count_rows('doc_id = 1')}")  # 1

// Create example table with document chunks
const table = await db.createTable(
  "chunks", 
  [
    { doc_id: 0, chunk_id: 0, text: "Hello" },
    { doc_id: 0, chunk_id: 1, text: "World" },
    { doc_id: 1, chunk_id: 0, text: "Foo" },
    { doc_id: 1, chunk_id: 1, text: "Bar" },
    { doc_id: 2, chunk_id: 0, text: "Baz" },
  ]
);

// New data - replacing all chunks for doc_id 1 with just one chunk
const newChunks = [
  { doc_id: 1, chunk_id: 0, text: "Zoo" }
];

// Replace all chunks for doc_id 1
await table.mergeInsert(["doc_id", "chunk_id"])
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .whenNotMatchedBySourceDelete("doc_id = 1")
  .execute(newChunks);

// Verify count for doc_id = 1 - should be 1
const count = await table.countRows("doc_id = 1");
console.log(`Chunks for doc_id = 1: ${count}`);  // 1

Explore full documentation in our SDK guides: Python and TypeScript.


  1. We suggest a batch size of around 500K rows.