Lance ❤️ Ray

Ray effortlessly scales up ML workloads to large distributed compute environments.

Lance format is one of the official Ray data sources:

  • Lance Data Source ray.data.read_lance()

  • Lance Data Sink ray.data.Dataset.write_lance()

import lance
import pandas as pd
import ray

# Initialize Ray before using the ray.data APIs below.
ray.init()

# Single location for the dataset so the writer and both readers agree.
dataset_path = "./alice_bob_and_charlie.lance"

data = [
    {"id": 1, "name": "alice"},
    {"id": 2, "name": "bob"},
    {"id": 3, "name": "charlie"},
]
expected = pd.DataFrame(data)

# Write the rows out as a Lance dataset through the Ray data sink.
ray.data.from_items(data).write_lance(dataset_path)

# It can be read via lance directly.
# Rows are sorted by id and re-indexed so the comparison below does not
# depend on the order in which rows come back.
df = (
    lance.dataset(dataset_path)
    .to_table()
    .to_pandas()
    .sort_values(by=["id"])
    .reset_index(drop=True)
)
assert df.equals(expected), f"{df} != {expected}"

# Or via ray.data.read_lance
ray_df = (
    ray.data.read_lance(dataset_path)
    .to_pandas()
    .sort_values(by=["id"])
    .reset_index(drop=True)
)
assert df.equals(ray_df), f"{df} != {ray_df}"