Usage guide
!pip install -U odp-sdk --quiet
from odp.client import OdpClient # The SDK
from odp.dto.catalog import DataCollectionDto, DatasetDto, DatasetSpec, ObservableDto, DataCollectionSpec
from odp.dto import MetadataDto
from odp.dto.common.contact_info import ContactInfo
from odp.client.exc import OdpFileAlreadyExistsError, OdpResourceExistsError
import pyarrow as pa
import pandas as pd
from datetime import datetime, timedelta, date
from decimal import Decimal
import requests
Create Dataset in Catalog for personal testing
# Instantiate the SDK client — all catalog and table operations go through it.
# (This line was missing: `client` was used below without ever being created.)
client = OdpClient()

# Personalised name avoids clashing with other users' test datasets.
dataset_name = client.personalize_name("tabular-v2-example-dataset")

# Declare a dataset manifest to add to the catalog
dataset_manifest = DatasetDto(
    metadata=MetadataDto(
        name=dataset_name,
        display_name=dataset_name,
        description=f'A test dataset for tabular v2, {dataset_name}',
        labels={"hubocean.io/test": True, 'tabular_v2': 1.0},
    ),
    spec=DatasetSpec(
        # Storage controller/class pair selecting tabular-v2 storage
        storage_controller="registry.hubocean.io/storageController/storage-tabular",
        storage_class="registry.hubocean.io/storageClass/tabular",
        maintainer=ContactInfo(
            contact="User McUsername <user.mcusername@emailprovider.com>",
            organisation="Organisation Name",
        ),
    ),
)

# Create the dataset; if it already exists, fetch the existing entry instead.
try:
    dataset_dto = client.catalog.create(dataset_manifest)
except OdpResourceExistsError:
    dataset_dto = client.catalog.get("catalog.hubocean.io/dataset/" + dataset_manifest.metadata.name)
    print("Dataset already exists. Fetched existing dataset:", dataset_dto.metadata.name)
Minimal tabular dataset creation
# Bind a tabular-v2 handle to the dataset.
tab = client.table_v2(dataset_dto)

# 1. Drop the table, then re-bind the handle
tab.drop()
tab = client.table_v2(dataset_dto)

# 2. Create a new table with our desired schema
schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('category', pa.string()),
    # String column marked as a geometry via field metadata
    pa.field('geometry', pa.string(), metadata={b'isGeometry': b'1'}),
])
tab.create(schema)

# 3. Verify the new schema
print(tab.schema())
id: int64
category: string
geometry: string
-- field metadata --
isGeometry: '1'
# Two small batches of fake rows matching the minimal schema above.
inputdata = [
    {'id': 1, 'category': 'A', 'geometry': 'POINT(1 1)'},
    {'id': 2, 'category': 'B', 'geometry': 'POINT(2 1)'},
]
inputdata2 = [
    {'id': 3, 'category': 'C', 'geometry': 'POINT(2 1)'},
    {'id': 4, 'category': 'D', 'geometry': 'POINT(2 2)'},
]
Insert data from a list of dictionaries
# Entering the table's context manager yields a transaction to insert through.
with tab as txn:
    txn.insert(inputdata)

# Quick output one row to check: see further down guide for full .select() functionality and best practice
next(tab.select().rows())
{'id': 1, 'category': 'A', 'geometry': 'POINT(1 1)'}
Insert data from a DataFrame
# Build a DataFrame from the second batch of fake rows
df = pd.DataFrame(inputdata2)
df
# Insert by converting the DataFrame to a list of record dicts
with tab as txn:
    txn.insert(df.to_dict(orient='records'))
# Quick check of one row: see further down the guide for full .select() functionality and best practice
next(tab.select().rows())
{'id': 3, 'category': 'C', 'geometry': 'POINT(2 1)'}
# Quick check of a user-defined number of rows (one page of 4): see further
# down the guide for full .select() functionality and best practice
first_page = tab.select().pages(size=4)
next(first_page)
[{'id': 3, 'category': 'C', 'geometry': 'POINT(2 1)'},
{'id': 4, 'category': 'D', 'geometry': 'POINT(2 2)'},
{'id': 3, 'category': 'C', 'geometry': 'POINT(2 1)'},
{'id': 4, 'category': 'D', 'geometry': 'POINT(2 2)'}]
Clean up and remove data
# .drop() deletes all data and the schema
tab.drop()
# To remove the catalog entry as well, uncomment:
#client.catalog.delete(dataset_dto)

# Selecting from the dropped table raises FileNotFoundError
try:
    outputrow = next(tab.select().rows())
except FileNotFoundError as err:
    print(f"Error: {err} No data in the table")
    outputrow = None
Error: No data in the table
Complex tabular dataset creation
tab = client.table_v2(dataset_dto)

# Schema demonstrating the range of supported column types and the
# field-metadata flags understood by the tabular controller.
schema = pa.schema([
    # Required string field (nullable=False); suitable for exact matching and sorting
    pa.field('id', pa.string(), nullable=False),
    # Numeric types supporting mathematical operations and range queries
    pa.field('int_value', pa.int64()),      # exact counting / integers
    pa.field('float_value', pa.float64()),  # measurements / decimals
    # Decimal type - note only 10,2 precision is supported in the tabular controller
    pa.field('precise_value', pa.decimal128(10, 2)),
    # Boolean field for flags/states
    pa.field('flag', pa.bool_()),
    # Temporal fields; b'index' enables temporal range queries and sorting
    pa.field('date_only', pa.date32()),
    pa.field('timestamp_ms', pa.timestamp('ms'), metadata={b'index': b'1'}),
    # Geometry column: isGeometry marks the type (enables intersects/contains/within),
    # index enables spatial indexing, big stores the value as a separate file if large
    pa.field('geometry', pa.string(),
             metadata={b'isGeometry': b'1', b'index': b'1', b'big': b'1'}),
    # Geographic coordinate fields, marked for geo operations (e.g. H3)
    pa.field('latitude', pa.float64(), metadata={b'geo': b'latitude'}),
    pa.field('longitude', pa.float64(), metadata={b'geo': b'longitude'}),
    # 'big' columns are stored separately if they exceed STR_LIMIT (128 bytes)
    pa.field('large_text', pa.string(), metadata={b'big': b'1'}),
    pa.field('large_binary', pa.binary(), metadata={b'big': b'1'}),
])
# Create the table; on failure, assume the schema already exists and show it.
try:
    tab.create(schema)
    print(tab.schema())
except Exception as exc:
    print(f"Error creating table: {exc}")
    print("\nSchema may already exist:")
    print(tab.schema())
id: string not null
int_value: int64
float_value: double
precise_value: decimal128(10, 2)
flag: bool
date_only: date32[day]
timestamp_ms: timestamp[ms]
-- field metadata --
index: '1'
geometry: string
-- field metadata --
isGeometry: '1'
index: '1'
big: '1'
latitude: double
-- field metadata --
geo: 'latitude'
longitude: double
-- field metadata --
geo: 'longitude'
large_text: string
-- field metadata --
big: '1'
large_binary: binary
-- field metadata --
big: '1'
# Reference timestamp for the sample rows below.
base_time = datetime(2024, 1, 1, 12, 0, 0)

# Two sample rows exercising every column in the complex schema, including
# oversized text/binary values for the 'big' columns.
samplerows = [
    {
        'id': 'LOCATION-001',
        'int_value': 100,
        'float_value': 23.456,
        'precise_value': Decimal('123.45'),
        'flag': True,
        'date_only': base_time.date(),
        'timestamp_ms': base_time,
        'geometry': 'POINT(-122.4194 37.7749)',
        'latitude': 37.7749,
        'longitude': -122.4194,
        'large_text': 'COASTAL_DATA' * 1000,
        'large_binary': b'BINARY_DATA_1' * 1000,
    },
    {
        'id': 'LOCATION-002',
        'int_value': 200,
        'float_value': 45.678,
        'precise_value': Decimal('567.89'),
        'flag': False,
        'date_only': (base_time + timedelta(days=1)).date(),
        'timestamp_ms': base_time + timedelta(hours=12),
        'geometry': 'POINT(-98.4936 29.4241)',
        'latitude': 29.4241,
        'longitude': -98.4936,
        'large_text': 'INLAND_DATA' * 1000,
        'large_binary': b'BINARY_DATA_2' * 1000,
    },
]
# Insert the sample rows inside a transaction
with tab as txn:
    txn.insert(samplerows)
    print("Sample row inserted successfully")
Sample row inserted successfully
# Quick output as a dataframe: see further down the guide for full .select()
# functionality and best practice
frame_iter = tab.select().dataframes()
next(frame_iter)
# Quick output as a dataframe, this time applying filters and returning only
# selected columns: see further down the guide for more details
filtered = tab.select('flag == True AND precise_value < 200.0 AND latitude > 35',
                      cols=['id','geometry'])
next(filtered.dataframes())
Clean up and remove data
# .drop() deletes all data and the schema
tab.drop()
# To remove the catalog entry as well, uncomment:
#client.catalog.delete(dataset_dto)

# A select on the dropped table raises FileNotFoundError
try:
    outputrow = next(tab.select().rows())
except FileNotFoundError as err:
    print(f"Error: {err} No data in the table")
    outputrow = None
Error: No data in the table
Large tabular dataset creation
# Download an 11MB CSV file of real data (environmental data from the Mareano program in Norway)
response = requests.get('https://sadataproducts.blob.core.windows.net/huboceandatapublic/mareano_vulnerable.csv')
# Fail fast on HTTP errors rather than silently writing an error page to disk.
response.raise_for_status()

# Save the content to a file
with open('mareano_vulnerable.csv', 'wb') as file:
    file.write(response.content)

# Load into pandas and parse the 'dato' column as datetimes
df = pd.read_csv('mareano_vulnerable.csv')
df['dato'] = pd.to_datetime(df['dato'])
df.head(2)
# Define your schema (column names mirror the CSV headers)
schema = pa.schema([
    pa.field('gml_id', pa.string()),
    pa.field('lokalId', pa.string()),
    pa.field('navnerom', pa.string()),
    pa.field('versjonId', pa.int64()),        # 'long' maps to int64
    pa.field('målemetode', pa.string()),
    pa.field('datauttaksdato', pa.string()),
    pa.field('opphav', pa.string()),
    pa.field('referanseStasjon', pa.int64()),
    pa.field('toktNummer', pa.int64()),
    pa.field('prøveNummer', pa.int64()),
    pa.field('antall', pa.int64()),
    pa.field('dato', pa.date32()),
    pa.field('lengdegrad', pa.float64()),     # 'double' maps to float64
    pa.field('breddegrad', pa.float64()),
    pa.field('indikatorVME', pa.string()),
    pa.field('indicatorVME', pa.string()),
    pa.field('geometry_epsg25833', pa.string()),
    # Geometry column: spatially indexed, stored separately when large
    pa.field('geometry', pa.string(),
             metadata={b'isGeometry': b'1', b'big': b'1', b'index': b'1'}),
])
# Drop any previous table, then create one with the Mareano schema.
tab.drop()
try:
    tab.create(schema)
    print(tab.schema())
except Exception as exc:
    print(f"Error creating table: {exc}")
    print("\nSchema may already exist:")
    print(tab.schema())
gml_id: string
lokalId: string
navnerom: string
versjonId: int64
målemetode: string
datauttaksdato: string
opphav: string
referanseStasjon: int64
toktNummer: int64
prøveNummer: int64
antall: int64
dato: date32[day]
lengdegrad: double
breddegrad: double
indikatorVME: string
indicatorVME: string
geometry_epsg25833: string
geometry: string
-- field metadata --
isGeometry: '1'
big: '1'
index: '1'
# Insert the whole DataFrame as a list of record dicts, in one transaction
with tab as txn:
    txn.insert(df.to_dict(orient='records'))
    print("Sample data from dataframe inserted successfully")
Sample data from dataframe inserted successfully
# Re-bind the table handle to the dataset before the query examples below.
tab = client.table_v2(dataset_dto)
Selecting data
Selecting certain columns
# One row: all columns
row_iter = tab.select().rows()
next(row_iter)
{'gml_id': 'id68c569dc-1605-4af0-a75b-4b6858cfe245',
'lokalId': '5d4269c0-bba8-436a-b774-55d7991c178e',
'navnerom': 'https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata',
'versjonId': 20201201,
'målemetode': 'GNSSKodemålingEnkleMålinger',
'datauttaksdato': '2024-10-27T08:01:37',
'opphav': 'Havforskningsinstituttet',
'referanseStasjon': 1,
'toktNummer': 2006612,
'prøveNummer': 53,
'antall': 0,
'dato': datetime.date(2006, 6, 13),
'lengdegrad': 21.655499,
'breddegrad': 70.456017,
'indikatorVME': 'Blomkålkorallskog',
'indicatorVME': 'Nephtheidae medow',
'geometry_epsg25833': 'POINT (748055.943029 7830322.159233)',
'geometry': 'POINT (21.6555 70.456)'}
# One row: restrict the output to selected columns
subset_rows = tab.select(cols=['dato','geometry']).rows()
next(subset_rows)
{'dato': datetime.date(2006, 6, 13), 'geometry': 'POINT (21.6555 70.456)'}
Selecting methods
- NB: for any of these methods, .rows(), .pages() and .dataframes() each create a generator object.
- using next() will load the first item of this object (whether a row, page or dataframe)
- successive next() will load the next item, unless you refresh the .select() command
- you can iterate over these or concatenate to get the full outputs
# A fresh select restarts the generator from the first row.
selection = tab.select(cols=['dato','geometry'])
rows = selection.rows()
next(rows)
{'dato': datetime.date(2006, 6, 13), 'geometry': 'POINT (21.6555 70.456)'}
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.3911 71.3519)'}
# Print the next three rows; the generator continues from where next() left off.
printed = 0
for row in rows:
    if printed >= 3:
        break
    print(row)
    printed += 1
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.4169 71.3309)'}
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.4892 71.3288)'}
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.5345 71.2944)'}
# Pages of rows with a user-defined page size
page_iter = tab.select(cols=['dato','geometry']).pages(size=5)
next(page_iter)
[{'dato': datetime.date(2006, 6, 13), 'geometry': 'POINT (21.6555 70.456)'},
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.3911 71.3519)'},
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.4169 71.3309)'},
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.4892 71.3288)'},
{'dato': datetime.date(2006, 5, 27), 'geometry': 'POINT (22.5345 71.2944)'}]
# Dataframes are yielded 2000 rows at a time
frame_gen = tab.select().dataframes()
next(frame_gen).tail(2)
# RecordBatches with a platform-defined size for performance
batch_gen = tab.select().batches()
next(batch_gen)
pyarrow.RecordBatch
gml_id: string
lokalId: string
navnerom: string
versjonId: int64
målemetode: string
datauttaksdato: string
opphav: string
referanseStasjon: int64
toktNummer: int64
prøveNummer: int64
antall: int64
dato: date32[day]
lengdegrad: double
breddegrad: double
indikatorVME: string
indicatorVME: string
geometry_epsg25833: string
geometry: string
----
gml_id: ["id68c569dc-1605-4af0-a75b-4b6858cfe245","id4f09dad2-c9f9-4c2d-8ac7-088597b613ce","ided86afca-a75e-4860-8554-3a06faaaed86","id52e24142-8e59-43b4-8d2e-ea18e10fc3c4","idd7216d5d-d605-4d01-aafc-3f4bdec477b9","id14d54066-5b42-43e5-b0d0-28cd8c953e9a","idd31c948c-8f70-4573-a965-2b6d1eb483ba","id34c749ef-6bb8-44f5-9e1d-21e679ca8690","idf0774d13-325f-405a-a3dd-3c7c8506d5e6","idb3b3c7e7-7f25-4952-bae8-156edc9f016c",...,"id5167ff60-058c-49ba-99fc-00b8202a0564","idd7577b02-0496-4cae-be40-2379cfd8940d","idf7cacaba-b981-46fe-bebd-7ce0e1733b83","id99d10998-e71d-4d6b-b376-b53437035c34","idb62085e3-3439-464c-a542-d9c50ae9fbac","id9016ec1e-2c66-46e6-a2f9-be4f59016c4c","id34d81efc-a4f0-4b57-9667-b3fcffe3bc3b","idbeb5463e-357e-4762-a236-bada387e07ef","idab6c883d-81d6-4000-b0d1-ddf4016210e2","id400875c3-fe87-4a84-ad5b-2cb7fe390855"]
lokalId: ["5d4269c0-bba8-436a-b774-55d7991c178e","1cc3b1e5-4b20-428d-8df3-524bb57cb79b","370dad98-0f0d-40cd-899f-247d7e6ca514","aa824a4d-d3f9-408c-a6b9-558938c2c9b8","232af407-c87c-42be-9bb5-1cb710345b40","92f84e2e-13f4-4985-b6ba-260530b6782e","5533b0ee-ef1f-4799-b339-f405e4c97bdc","a3e08608-d95a-4772-bdd8-d5c610a13ea9","2887fa6b-830a-4d62-bae6-7b18a771f5fd","a155a3f0-7a0b-43f8-8d9f-e21a1522fc0d",...,"c3014828-a680-487f-a579-e0c1f8bbe249","3d420c87-37f4-4744-93ec-64b333364266","8cb60bdc-33ee-40fd-a6e0-50b592305352","71eba7f2-812e-4671-aec2-6134519da253","bfcaba41-dc3b-4d3c-9d92-12f650a99eac","c691a82b-36fb-4e31-b5bb-ea31946bcb65","a8614ddf-0109-4302-8708-17ce88b61a72","b3ad115f-7d95-4000-aece-b78c14324506","a664bdcd-31b8-4ebb-a4d4-7fc633d9cccf","13d0dccf-a533-483e-a24a-57f191a76b74"]
navnerom: ["https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata",...,"https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata","https://data.geonorge.no/sosi/biomangfold/sarbare_marine_bunndyr_observasjonsdata"]
versjonId: [20201201,20201201,20201201,20201201,20201201,20201201,20201201,20201201,20201201,20201201,...,20201201,20201201,20201201,20201201,20201201,20201201,20201201,20201201,20201201,20201201]
målemetode: ["GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger",...,"GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger","GNSSKodemålingEnkleMålinger"]
datauttaksdato: ["2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37","2024-10-27T08:01:37",...,"2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40","2024-10-27T08:01:40"]
opphav: ["Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet",...,"Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet","Havforskningsinstituttet"]
referanseStasjon: [1,2,3,4,5,6,7,8,9,10,...,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978]
toktNummer: [2006612,2006612,2006612,2006612,2006612,2006612,2006612,2006612,2006612,2006612,...,2019106,2019106,2019106,2019106,2019106,2019106,2019106,2019106,2019106,2019106]
prøveNummer: [53,2,3,4,5,6,7,8,10,11,...,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035]
...
Full dataset output - via pages
# Materialise every page, flatten to rows, and build one DataFrame
all_pages = list(tab.select().pages())
all_rows = []
for page in all_pages:
    all_rows.extend(page)
pd.DataFrame(all_rows).tail(2)
Full dataset output - via dataframes
# Concatenate every yielded dataframe into one
frames = list(tab.select().dataframes())
pd.concat(frames, ignore_index=True).tail(2)
Select - querying on column values
# Filter on numeric column values with boolean operators
numeric_query = "(referanseStasjon < 100 OR referanseStasjon > 3547) AND antall > 0"
next(tab.select(numeric_query, cols=['referanseStasjon','geometry','dato','antall']).dataframes())
# Parameterised query: '?' placeholders are filled from vars
date_selection = tab.select("dato >= ?", vars=[date.fromisoformat('2024-04-28')],
                            cols=['gml_id','geometry','dato','antall'])
df = next(date_selection.dataframes())
df
Select - geospatial data query - within
# Rows whose geometry lies within the given polygon
within_query = 'geometry within "POLYGON((20 70, 22 70, 22 74, 20 74, 20 70))"'
next(tab.select(within_query, cols=['referanseStasjon','geometry','dato','antall']).dataframes())
Select - geospatial data query - contains
# Rows whose geometry contains the given point
contains_query = 'geometry contains "POINT (20.533 74.6876)"'
next(tab.select(contains_query, cols=['referanseStasjon','geometry','dato','antall']).dataframes())
Select - geospatial data query - intersect
# Rows whose geometry intersects the given polygon
intersect_query = 'geometry intersect "POLYGON((20 70, 22 70, 22 74, 20 74, 20 70))"'
next(tab.select(intersect_query, cols=['referanseStasjon','geometry','dato','antall']).dataframes())
Clean up and remove data
# .drop() deletes all data and the schema
tab.drop()

# Selecting from the dropped table now raises FileNotFoundError
try:
    outputrow = next(tab.select().rows())
except FileNotFoundError as err:
    print(f"Error: {err} No data in the table")
    outputrow = None
Error: No data in the table
# Finally, delete the catalog entry created at the top of this guide.
client.catalog.delete(dataset_dto)