Tabular Storage
Tabular Storage Key Features
Schemas:
A schema is the recipe for the data that will be stored in a dataset. It contains the field names, types and partitioning information of the dataset. The column information of the dataset is provided in the TableSpec format.
- Create Schema: Easily create a schema object to represent the shape of the data that will be stored in the Tabular Storage.
- Get Schema: Retrieve a schema from the Tabular Storage.
- Delete Schema: Manage schemas by deleting unwanted or obsolete schemas.
Staging:
Staging is used while uploading data to the Tabular Storage. It lets you store data in a stage before committing it into the dataset.
- Create Stage: Easily create a stage for a Tabular Dataset.
- Commit Stage: Commit the staged data into the Tabular Dataset.
- Get Stage: Retrieve a stage.
- List Stages: List the stages of a dataset.
- Delete Stage: Manage stages by deleting unwanted or obsolete stages.
Data Operations:
- Select Data: Retrieve data from the Tabular Storage as a stream, a list or a Pandas DataFrame.
- Insert Data: Insert data into the Tabular Storage.
- Update Data: Update data in the Tabular Storage.
- Delete Data: Delete data from the Tabular Storage.
Querying:
To query the data in the Tabular Storage, ODP provides a query language called Object Query Structure (OQS). It is a powerful query language that can be used to filter, sort and aggregate data. For more information on OQS, please refer to the OQS documentation.
Functions
create_schema
Create Schema.
Arguments
- resource_dto (DatasetDto): Dataset resource.
- table_spec (TableSpec): Specifications of the schema to be created.
Returns
- Specifications of the schema that is being created.
Raises
- OdpResourceExistsError: If the schema already exists with the same identifier
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from odp.client.dto.table_spec import TableSpec
client = OdpClient()
# DatasetDto
my_dataset = {...}
table_schema = {"Data": {"type": "string"}}
my_table_spec = TableSpec(table_schema=table_schema)
mt_table_spec = client.tabular.create_schema(resource_dto=my_dataset, table_spec=my_table_spec)
print(mt_table_spec)
get_schema
Get schema.
Arguments
- resource_dto (DatasetDto): Dataset resource.
Returns
- Specifications of the schema that is being queried.
Raises
- OdpResourceNotFoundError: If the schema cannot be found
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
mt_table_spec = client.tabular.get_schema(resource_dto=my_dataset)
print(mt_table_spec)
delete_schema
Delete schema.
Arguments
resource_dto
(DatasetDto): Dataset resource.delete_data
(Optional(bool), default=False): Bool to specify whether the data should be deleted as well
Raises
- OdpResourceNotFoundError: If the schema cannot be found
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
client.tabular.delete_schema(my_dataset)
print("Dataset deleted successfully")
create_stage_request
Create Stage.
Arguments
- resource_dto (DatasetDto): Dataset resource.
Returns
- Specifications of the stage that is being created.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
my_stage = client.tabular.create_stage_request(resource_dto=my_dataset)
print(my_stage)
commit_stage_request
Commit Stage.
Arguments
resource_dto
(DatasetDto): Dataset resource.table_stage
(TableStage): Stage object.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from odp.client.dto.tabular_store import TableStage
client = OdpClient()
# DatasetDto
my_dataset = {...}
# TableStage
my_stage = {...}
client.tabular.commit_stage_request(resource_dto=my_dataset, table_stage=my_stage)
print("Schema comitted successfully")
get_stage_request
Get Stage.
Arguments
resource_dto
(DatasetDto): Dataset resource.stage
(UUID | TableStage): Stage object or UUID.
Returns
- Stage that is queried for.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from uuid import UUID
client = OdpClient()
# DatasetDto
my_dataset = {...}
# TableStage
my_uuid = {...}
my_stage = client.tabular.get_stage_request(resource_dto=my_dataset, stage=my_uuid)
print(my_stage)
list_stage_request
List Stages for a dataset.
Arguments
- resource_dto (DatasetDto): Dataset resource.
Returns
- Stages that are related to the dataset.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
my_stages = client.tabular.list_stage_request(resource_dto=my_dataset)
print(my_stages)
delete_stage_request
Delete Stage.
Arguments
resource_dto
(DatasetDto): Dataset resource.table_stage
(TableStage): Stage object.force_delete
(Optional(bool), default=False): Bool to specify whether the data should be force deleted. If you want to delete a stage that has the status set ascommit
you need to set this parameter toTrue
else you will get aHTTP 409
error.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from odp.client.dto.tabular_store import TableStage
client = OdpClient()
# DatasetDto
my_dataset = {...}
# TableStage
my_stage = {...}
client.tabular.delete_stage_request(resource_dto=my_dataset, table_stage=my_stage, force_delete=True)
print("Stage deleted successfully")
select_as_stream
Select data from dataset as stream.
Arguments
resource_dto
(DatasetDto): Dataset resource.filter_query
(Optional[dict]): Filter query in OQS format. Read more about OQS here.limit
(int): Limit for the number of rows returned.
Yields
- Data that is queried as a stream.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
filter_query = {...}
limit = 1000
data = client.tabular.select_as_stream(my_dataset, filter_query, limit)
print("Dataset's data:", f"{[datapoint for datapoint in data]}")
select_as_list
Select data from dataset as list.
Arguments
resource_dto
(DatasetDto): Dataset resource.filter_query
(Optional[dict]): Filter query in OQS format. Read more about OQS here.limit
(int): Limit for the number of rows returned.
Returns
- Data that is queried as a list.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
filter_query = {...}
limit = 1000
data = client.tabular.select_as_list(my_dataset, filter_query, limit)
print(data)
select_as_dataframe
Select data from dataset as Pandas Dataframe.
Arguments
resource_dto
(DatasetDto): Dataset resource.filter_query
(Optional[dict]): Filter query in OQS format. Read more about OQS here.
Returns
- Data that is queried as a Pandas Dataframe.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from pandas import DataFrame
client = OdpClient()
# DatasetDto
my_dataset = {...}
filter_query = {...}
dataframe = client.tabular.select_as_dataframe(my_dataset, filter_query)
print(dataframe)
write
Write data to dataset.
Arguments
resource_dto
(DatasetDto): Dataset resource.data
(list): Data to ingest.table_stage
(Optional[TableStage]): Stage object.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from odp.client.dto.tabular_store import TableStage
client = OdpClient()
# DatasetDto
my_dataset = {...}
test_data = [{...}, {...}, {...}]
#TableStage
my_stage = {...}
client.tabular.write(resource_dto=my_dataset, data=test_data, table_stage=my_stage)
print("Dataset's data:", client.tabular.select_as_list(my_dataset))
write_dataframe
Write data to dataset in Pandas DataFrame format.
Arguments
resource_dto
(DatasetDto): Dataset resource.data
(Dataframe): Data to ingest.table_stage
(Optional[TableStage]): Stage object.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from odp.client.dto.tabular_store import TableStage
client = OdpClient()
# DatasetDto
my_dataset = {...}
test_data = DataFrame({...}, {...}, {...})
#TableStage
my_stage = {...}
client.tabular.write_dataframe(resource_dto=my_dataset, data=test_data, table_stage=my_stage)
print("Dataset's data:", client.tabular.select_as_list(my_dataset))
update
Update data in dataset.
Arguments
resource_dto
(DatasetDto): Dataset resource.data
(list): Data to ingest.filter_query
(Optional[dict]): Filter query in OQS format. Read more about OQS here.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
data = [{...}, {...}, {...}]
filter_query = {...}
client.tabular.update(resource_dto=my_dataset, data=data, filter_query=filter_query)
print("Dataset's data:", client.tabular.select_as_list(my_dataset))
update_dataframe
Update data in dataset, with the new data supplied in Pandas DataFrame format.
Arguments
resource_dto
(DatasetDto): Dataset resource.data
(DataFrame): Data to ingest.filter_query
(Optional[dict]): Filter query in OQS format. Read more about OQS here.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
from pandas import DataFrame
client = OdpClient()
# DatasetDto
my_dataset = {...}
dataframe = DataFrame({...}, {...}, {...})
filter_query = {...}
client.tabular.update_dataframe(resource_dto=my_dataset, data=dataframe, filter_query=filter_query)
print("Dataset's data:", client.tabular.select_as_list(my_dataset))
delete
Delete data from dataset.
Arguments
resource_dto
(DatasetDto): Dataset resource.filter_query
(Optional[dict]): Filter query in OQS format. Read more about OQS here.
Example
from odp.client import OdpClient
from odp.dto import Metadata
from odp.dto.catalog import DatasetDto, DatasetSpec
from odp.dto.common.contact_info import ContactInfo
client = OdpClient()
# DatasetDto
my_dataset = {...}
filter_query = {...}
client.tabular.delete(resource_dto=my_dataset, filter_query=filter_query)
print("Data deleted successfully")