Merge insert

The merge insert command is a flexible API that can be used to perform:

Upsert
Insert-if-not-exists
Replace range

It works by joining the input data with the target table on a key you provide. Often this key is a unique row id key. You can then specify what to do when there is a match and when there is not a match. For example, for upsert you want to update if the row has a match and insert if the row doesn't have a match. Whereas for insert-if-not-exists you only want to insert if the row doesn't have a match.

You can also read more in the API reference:

Python
- Sync: lancedb.table.Table.merge_insert
- Async: lancedb.table.AsyncTable.merge_insert
Typescript: lancedb.Table.mergeInsert

Use scalar indices to speed up merge insert

The merge insert command needs to perform a join between the input data and the target table on the on key you provide. This requires scanning that entire column, which can be expensive for large tables. To speed up this operation, you can create a scalar index on the on column, which will allow LanceDB to find matches without having to scan the whole tables.

Read more about scalar indices in Building a Scalar Index guide.

Embedding Functions

Like the create table and add APIs, the merge insert API will automatically compute embeddings if the table has a embedding definition in its schema. If the input data doesn't contain the source column, or the vector column is already filled, then the embeddings won't be computed. See the Embedding Functions guide for more information.

Upsert

Upsert updates rows if they exist and inserts them if they don't. To do this with merge insert, enable both when_matched_update_all() and when_not_matched_insert_all().

PythonTypescript

Sync APIAsync API

table = db.create_table(
    "users",
    [
        {"id": 0, "name": "Alice"},
        {"id": 1, "name": "Bob"},
    ],
)
new_users = [
    {"id": 1, "name": "Bobby"},
    {"id": 2, "name": "Charlie"},
]
res = (
    table.merge_insert("id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(new_users)
)
table.count_rows()  # 3
res  # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}

table = await db.create_table(
    "users",
    [
        {"id": 0, "name": "Alice"},
        {"id": 1, "name": "Bob"},
    ],
)
new_users = [
    {"id": 1, "name": "Bobby"},
    {"id": 2, "name": "Charlie"},
]
res = await (
    table.merge_insert("id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(new_users)
)
await table.count_rows()  # 3
res
# MergeResult(version=2, num_updated_rows=1,
# num_inserted_rows=1, num_deleted_rows=0)

@lancedb/lancedb

const table = await db.createTable("users", [
  { id: 0, name: "Alice" },
  { id: 1, name: "Bob" },
]);

const newUsers = [
  { id: 1, name: "Bobby" },
  { id: 2, name: "Charlie" },
];
await table
  .mergeInsert("id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute(newUsers);

await table.countRows(); // 3

Providing subsets of columns

If a column is nullable, it can be omitted from input data and it will be considered null. Columns can also be provided in any order.

Insert-if-not-exists

To avoid inserting duplicate rows, you can use the insert-if-not-exists command. This will only insert rows that do not have a match in the target table. To do this with merge insert, enable just when_not_matched_insert_all().

PythonTypescript

Sync APIAsync API

table = db.create_table(
    "domains",
    [
        {"domain": "google.com", "name": "Google"},
        {"domain": "github.com", "name": "GitHub"},
    ],
)
new_domains = [
    {"domain": "google.com", "name": "Google"},
    {"domain": "facebook.com", "name": "Facebook"},
]
res = (
    table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
)
table.count_rows()  # 3
res
# MergeResult(version=2, num_updated_rows=0,
# num_inserted_rows=1, num_deleted_rows=0)

    table = await db.create_table(
        "domains",
        [
            {"domain": "google.com", "name": "Google"},
            {"domain": "github.com", "name": "GitHub"},
        ],
    )
    new_domains = [
        {"domain": "google.com", "name": "Google"},
        {"domain": "facebook.com", "name": "Facebook"},
    ]
    res = await (
        table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
    )
    await table.count_rows()  # 3
    res
    # MergeResult(version=2, num_updated_rows=0,
    # num_inserted_rows=1, num_deleted_rows=0)
    assert await table.count_rows() == 3
    assert res.version == 2
    assert res.num_inserted_rows == 1
    assert res.num_deleted_rows == 0
    assert res.num_updated_rows == 0


def test_replace_range(mem_db):
    db = mem_db
    table = db.create_table(
        "chunks",
        [
            {"doc_id": 0, "chunk_id": 0, "text": "Hello"},
            {"doc_id": 0, "chunk_id": 1, "text": "World"},
            {"doc_id": 1, "chunk_id": 0, "text": "Foo"},
            {"doc_id": 1, "chunk_id": 1, "text": "Bar"},
        ],
    )
    new_chunks = [
        {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
    ]
    res = (
        table.merge_insert(["doc_id", "chunk_id"])
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .when_not_matched_by_source_delete("doc_id = 1")
        .execute(new_chunks)
    )
    table.count_rows("doc_id = 1")  # 1
    res
    # MergeResult(version=2, num_updated_rows=1,
    # num_inserted_rows=0, num_deleted_rows=1)
    assert table.count_rows("doc_id = 1") == 1
    assert res.version == 2
    assert res.num_inserted_rows == 0
    assert res.num_deleted_rows == 1
    assert res.num_updated_rows == 1


@pytest.mark.asyncio
async def test_replace_range_async(mem_db_async):
    db = mem_db_async
    table = await db.create_table(
        "chunks",
        [
            {"doc_id": 0, "chunk_id": 0, "text": "Hello"},
            {"doc_id": 0, "chunk_id": 1, "text": "World"},
            {"doc_id": 1, "chunk_id": 0, "text": "Foo"},
            {"doc_id": 1, "chunk_id": 1, "text": "Bar"},
        ],
    )
    new_chunks = [
        {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
    ]
    res = await (
        table.merge_insert(["doc_id", "chunk_id"])
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .when_not_matched_by_source_delete("doc_id = 1")
        .execute(new_chunks)
    )
    await table.count_rows("doc_id = 1")  # 1
    res
    # MergeResult(version=2, num_updated_rows=1,
    # num_inserted_rows=0, num_deleted_rows=1)
    assert await table.count_rows("doc_id = 1") == 1
    assert res.version == 2
    assert res.num_inserted_rows == 0
    assert res.num_deleted_rows == 1
    assert res.num_updated_rows == 1

@lancedb/lancedb

const table2 = await db.createTable("domains", [
  { domain: "google.com", name: "Google" },
  { domain: "github.com", name: "GitHub" },
]);

const newDomains = [
  { domain: "google.com", name: "Google" },
  { domain: "facebook.com", name: "Facebook" },
];
await table2
  .mergeInsert("domain")
  .whenNotMatchedInsertAll()
  .execute(newDomains);
await table2.countRows(); // 3

Replace range

You can also replace a range of rows in the target table with the input data. For example, if you have a table of document chunks, where each chunk has both a doc_id and a chunk_id, you can replace all chunks for a given doc_id with updated chunks. This can be tricky otherwise because if you try to use upsert when the new data has fewer chunks you will end up with extra chunks. To avoid this, add another clause to delete any chunks for the document that are not in the new data, with when_not_matched_by_source_delete.

PythonTypescript

Sync APIAsync API

    table = db.create_table(
        "chunks",
        [
            {"doc_id": 0, "chunk_id": 0, "text": "Hello"},
            {"doc_id": 0, "chunk_id": 1, "text": "World"},
            {"doc_id": 1, "chunk_id": 0, "text": "Foo"},
            {"doc_id": 1, "chunk_id": 1, "text": "Bar"},
        ],
    )
    new_chunks = [
        {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
    ]
    res = (
        table.merge_insert(["doc_id", "chunk_id"])
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .when_not_matched_by_source_delete("doc_id = 1")
        .execute(new_chunks)
    )
    table.count_rows("doc_id = 1")  # 1
    res
    # MergeResult(version=2, num_updated_rows=1,
    # num_inserted_rows=0, num_deleted_rows=1)
    assert table.count_rows("doc_id = 1") == 1
    assert res.version == 2
    assert res.num_inserted_rows == 0
    assert res.num_deleted_rows == 1
    assert res.num_updated_rows == 1


@pytest.mark.asyncio
async def test_replace_range_async(mem_db_async):
    db = mem_db_async
    table = await db.create_table(
        "chunks",
        [
            {"doc_id": 0, "chunk_id": 0, "text": "Hello"},
            {"doc_id": 0, "chunk_id": 1, "text": "World"},
            {"doc_id": 1, "chunk_id": 0, "text": "Foo"},
            {"doc_id": 1, "chunk_id": 1, "text": "Bar"},
        ],
    )
    new_chunks = [
        {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
    ]
    res = await (
        table.merge_insert(["doc_id", "chunk_id"])
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .when_not_matched_by_source_delete("doc_id = 1")
        .execute(new_chunks)
    )
    await table.count_rows("doc_id = 1")  # 1
    res
    # MergeResult(version=2, num_updated_rows=1,
    # num_inserted_rows=0, num_deleted_rows=1)
    assert await table.count_rows("doc_id = 1") == 1
    assert res.version == 2
    assert res.num_inserted_rows == 0
    assert res.num_deleted_rows == 1
    assert res.num_updated_rows == 1

table = await db.create_table(
    "chunks",
    [
        {"doc_id": 0, "chunk_id": 0, "text": "Hello"},
        {"doc_id": 0, "chunk_id": 1, "text": "World"},
        {"doc_id": 1, "chunk_id": 0, "text": "Foo"},
        {"doc_id": 1, "chunk_id": 1, "text": "Bar"},
    ],
)
new_chunks = [
    {"doc_id": 1, "chunk_id": 0, "text": "Baz"},
]
res = await (
    table.merge_insert(["doc_id", "chunk_id"])
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .when_not_matched_by_source_delete("doc_id = 1")
    .execute(new_chunks)
)
await table.count_rows("doc_id = 1")  # 1
res
# MergeResult(version=2, num_updated_rows=1,
# num_inserted_rows=0, num_deleted_rows=1)
assert await table.count_rows("doc_id = 1") == 1
assert res.version == 2
assert res.num_inserted_rows == 0
assert res.num_deleted_rows == 1
assert res.num_updated_rows == 1

@lancedb/lancedb

const table3 = await db.createTable("chunks", [
  { doc_id: 0, chunk_id: 0, text: "Hello" },
  { doc_id: 0, chunk_id: 1, text: "World" },
  { doc_id: 1, chunk_id: 0, text: "Foo" },
  { doc_id: 1, chunk_id: 1, text: "Bar" },
]);

const newChunks = [{ doc_id: 1, chunk_id: 0, text: "Baz" }];

await table3
  .mergeInsert(["doc_id", "chunk_id"])
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .whenNotMatchedBySourceDelete({ where: "doc_id = 1" })
  .execute(newChunks);

await table3.countRows("doc_id = 1"); // 1