Objective: Ingest CSV data into an Apache Iceberg table using a Spark notebook
Steps:
1. Start the VM
2. Start the Docker containers:
cd /opt/de
[root@localhost de]# docker container start minio nessie notebook dremio
minio
nessie
notebook
dremio
[root@localhost de]# docker ps -a
CONTAINER ID   IMAGE                         COMMAND                  CREATED       STATUS                   PORTS                                                                                                                                                      NAMES
a2ff384ff8bd   projectnessie/nessie          "/usr/local/s2i/run"     5 days ago    Up 8 minutes             8080/tcp, 8443/tcp, 0.0.0.0:19120->19120/tcp, [::]:19120->19120/tcp                                                                                       nessie
5b600d98db32   alexmerced/spark33-notebook   "/bin/sh -c '~/.loca…"   5 days ago    Up 8 minutes             0.0.0.0:8888->8888/tcp, [::]:8888->8888/tcp                                                                                                                notebook
f458b4d5aa97   dremio/dremio-oss:latest      "bin/dremio start-fg"    5 days ago    Up 8 minutes             0.0.0.0:9047->9047/tcp, [::]:9047->9047/tcp, 0.0.0.0:31010->31010/tcp, [::]:31010->31010/tcp, 0.0.0.0:32010->32010/tcp, [::]:32010->32010/tcp, 45678/tcp   dremio
886545b2f25f   minio/minio                   "/usr/bin/docker-ent…"   5 days ago    Up 8 minutes             0.0.0.0:9000-9001->9000-9001/tcp, [::]:9000-9001->9000-9001/tcp                                                                                           minio
5ab3191a896b   hello-world                   "/hello"                 5 weeks ago   Exited (0) 5 weeks ago                                                                                                                                                              tender_montalcini
[root@localhost de]#
3. Log in to the Spark notebook in the browser
Enter the token printed in the notebook container's output (for example, run docker logs notebook and copy the token from the URL it prints):
http://127.0.0.1:8888/?token=22f81663131ec8ebf8d67b15a9a59d0748e875a4c717ba49
4. Observe the homepage; it contains two main folders:
a. notebook
b. sampledata
5. Create a new notebook inside the notebook folder and load the CSV content into an Iceberg table
*** notebook content ***
import pyspark
from pyspark.sql import SparkSession
import os
## DEFINE SENSITIVE VARIABLES
NESSIE_URI = os.environ.get("NESSIE_URI") ## Nessie Server URI
WAREHOUSE = os.environ.get("WAREHOUSE") ## BUCKET TO WRITE DATA TO
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY") ## AWS CREDENTIALS
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY") ## AWS CREDENTIALS
AWS_S3_ENDPOINT = os.environ.get("AWS_S3_ENDPOINT") ## MINIO ENDPOINT
print(AWS_S3_ENDPOINT)
print(NESSIE_URI)
print(WAREHOUSE)
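## Optional guard (an addition, not part of the original notebook): fail fast with a
## clear message if any of the environment variables above was not set on the container.
_required = {
    "NESSIE_URI": NESSIE_URI,
    "WAREHOUSE": WAREHOUSE,
    "AWS_ACCESS_KEY": AWS_ACCESS_KEY,
    "AWS_SECRET_KEY": AWS_SECRET_KEY,
    "AWS_S3_ENDPOINT": AWS_S3_ENDPOINT,
}
_missing = [name for name, value in _required.items() if not value]
if _missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(_missing)}")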
conf = (
pyspark.SparkConf()
.setAppName('app_name')
.set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')
.set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
.set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
.set('spark.sql.catalog.nessie.uri', NESSIE_URI)
.set('spark.sql.catalog.nessie.ref', 'main')
.set('spark.sql.catalog.nessie.authentication.type', 'NONE')
.set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
.set('spark.sql.catalog.nessie.s3.endpoint', AWS_S3_ENDPOINT)
.set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
.set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
.set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
.set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
)
## Start Spark Session
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Running")
### load the CSV into a temporary SQL view
csv_df=spark.read.format("csv").option("header", "true").load("../sampledata/Worker_Coops.csv")
csv_df.createOrReplaceTempView("csv_open_2025")
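## Optional check (an addition, not part of the original walkthrough): confirm the CSV
## parsed as expected before writing it to Iceberg. Note that without
## .option("inferSchema", "true") every column is read as a string.
csv_df.printSchema()
print(csv_df.count(), "rows read from Worker_Coops.csv")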
### create an Apache Iceberg table from the view (CTAS)
spark.sql("create table if not exists nessie.Worker_Coops_2025 using iceberg as select * from csv_open_2025;").show()
spark.sql("select * from nessie.Worker_Coops_2025 limit 10;").show()
>>> The table is created successfully and the records are fetched as well. Capture the output next time.
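As an optional follow-up (not captured in the original run), you can confirm the commit on the Nessie main branch by describing the table and querying its snapshots through Spark SQL; the nessie.Worker_Coops_2025.snapshots identifier below assumes Iceberg's standard metadata-table naming applied to the table created above.
spark.sql("describe table nessie.Worker_Coops_2025").show()
spark.sql("select snapshot_id, committed_at, operation from nessie.Worker_Coops_2025.snapshots").show(truncate=False)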