Friday, January 23, 2026

Ingest CSV data into Apache Iceberg using a Spark notebook

Objective: Ingest CSV data into Apache Iceberg using a Spark notebook.


Steps:

1. Start our VM

2. Start our Docker containers


cd /opt/de

[root@localhost de]# docker container start minio nessie notebook dremio
minio
nessie
notebook
dremio

[root@localhost de]# docker ps -a
CONTAINER ID   IMAGE                         COMMAND                  CREATED       STATUS                   PORTS                                                                 NAMES
a2ff384ff8bd   projectnessie/nessie          "/usr/local/s2i/run"     5 days ago    Up 8 minutes             8080/tcp, 8443/tcp, 0.0.0.0:19120->19120/tcp, [::]:19120->19120/tcp   nessie
5b600d98db32   alexmerced/spark33-notebook   "/bin/sh -c '~/.loca…"   5 days ago    Up 8 minutes             0.0.0.0:8888->8888/tcp, [::]:8888->8888/tcp                           notebook
f458b4d5aa97   dremio/dremio-oss:latest      "bin/dremio start-fg"    5 days ago    Up 8 minutes             0.0.0.0:9047->9047/tcp, [::]:9047->9047/tcp, 0.0.0.0:31010->31010/tcp, [::]:31010->31010/tcp, 0.0.0.0:32010->32010/tcp, [::]:32010->32010/tcp, 45678/tcp   dremio
886545b2f25f   minio/minio                   "/usr/bin/docker-ent…"   5 days ago    Up 8 minutes             0.0.0.0:9000-9001->9000-9001/tcp, [::]:9000-9001->9000-9001/tcp      minio
5ab3191a896b   hello-world                   "/hello"                 5 weeks ago   Exited (0) 5 weeks ago                                                                         tender_montalcini
[root@localhost de]#



3. Log in to the Spark notebook in the browser


Enter the token shown in the notebook container's startup output (for example, via docker container logs notebook):


http://127.0.0.1:8888/?token=22f81663131ec8ebf8d67b15a9a59d0748e875a4c717ba49


4. Observe the homepage; it contains two main folders:


a. notebook

b. sampledata


5. Create a new notebook inside the notebook folder and load the CSV content into an Iceberg table.


*** Notebook content ***


import pyspark
from pyspark.sql import SparkSession
import os

## DEFINE SENSITIVE VARIABLES
NESSIE_URI = os.environ.get("NESSIE_URI") ## Nessie Server URI
WAREHOUSE = os.environ.get("WAREHOUSE") ## BUCKET TO WRITE DATA TO
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY") ## AWS CREDENTIALS
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY") ## AWS CREDENTIALS
AWS_S3_ENDPOINT = os.environ.get("AWS_S3_ENDPOINT") ## MINIO ENDPOINT



print(AWS_S3_ENDPOINT)
print(NESSIE_URI)
print(WAREHOUSE)
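
## If any of the values above print as None, the variable is not set in the
## container's environment. A minimal fallback sketch; the endpoint, bucket,
## and credentials below are assumptions for a local MinIO + Nessie setup,
## not values captured from this environment:
NESSIE_URI = NESSIE_URI or "http://nessie:19120/api/v1"
WAREHOUSE = WAREHOUSE or "s3a://warehouse/"
AWS_ACCESS_KEY = AWS_ACCESS_KEY or "admin"        ## hypothetical MinIO access key
AWS_SECRET_KEY = AWS_SECRET_KEY or "password"     ## hypothetical MinIO secret key
AWS_S3_ENDPOINT = AWS_S3_ENDPOINT or "http://minio:9000"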



conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
        ## Pull the Iceberg Spark runtime, Nessie Spark extensions, and AWS SDK jars at session start
        .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        ## Register an Iceberg catalog named "nessie", backed by the Nessie server (branch: main)
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', NESSIE_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        ## Write table data and metadata to the MinIO bucket via S3FileIO
        .set('spark.sql.catalog.nessie.s3.endpoint', AWS_S3_ENDPOINT)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
        ## MinIO credentials for Hadoop s3a access
        .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
        .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
)



## Start Spark Session
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Running")
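
## Optional sanity check (a sketch, assuming the "main" branch is reachable):
## list the tables in the nessie catalog; empty output is fine on a fresh setup.
spark.sql("SHOW TABLES IN nessie").show()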



### Load the CSV into a temporary SQL view

csv_df = spark.read.format("csv").option("header", "true").load("../sampledata/Worker_Coops.csv")
csv_df.createOrReplaceTempView("csv_open_2025")
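
## Optional: inspect what was read before writing. With header=true and no
## inferSchema option, every column is loaded as a string.
csv_df.printSchema()
print(csv_df.count())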


### Create an Iceberg table from the view (CTAS)

spark.sql("CREATE TABLE IF NOT EXISTS nessie.Worker_Coops_2025 USING iceberg AS SELECT * FROM csv_open_2025").show()
spark.sql("SELECT * FROM nessie.Worker_Coops_2025 LIMIT 10").show()
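
## Optional follow-ups, sketched on the same table: append the DataFrame again
## through the DataFrameWriterV2 API (this duplicates the rows; it only
## demonstrates the write path), then read Iceberg's snapshots metadata table
## to see one snapshot per committed write.
csv_df.writeTo("nessie.Worker_Coops_2025").append()
spark.sql("SELECT committed_at, snapshot_id, operation FROM nessie.Worker_Coops_2025.snapshots").show()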




>>> The table is created successfully and the records are fetched as well (output not captured here).

