Friday, January 23, 2026

Ingest csv data into Apache Iceberg using spark notebook

 Objective: Ingest csv data into Apache Iceberg using spark notebook


Steps:

1. Start our VM

2. start our docker containers


cd /opt/de

[root@localhost de]# docker container start minio nessie notebook dremio

minio

nessie

notebook

dremio

[root@localhost de]# docker ps -a

CONTAINER ID   IMAGE                         COMMAND                  CREATED       STATUS                   PORTS

                                                                                           NAMES

a2ff384ff8bd   projectnessie/nessie          "/usr/local/s2i/run"     5 days ago    Up 8 minutes             8080/tcp, 8443/tcp, 0.0.0.0:19120->19120/tcp, [::]:19120->19120/

tcp                                                                                        nessie

5b600d98db32   alexmerced/spark33-notebook   "/bin/sh -c '~/.loca…"   5 days ago    Up 8 minutes             0.0.0.0:8888->8888/tcp, [::]:8888->8888/tcp

                                                                                           notebook

f458b4d5aa97   dremio/dremio-oss:latest      "bin/dremio start-fg"    5 days ago    Up 8 minutes             0.0.0.0:9047->9047/tcp, [::]:9047->9047/tcp, 0.0.0.0:31010->3101

0/tcp, [::]:31010->31010/tcp, 0.0.0.0:32010->32010/tcp, [::]:32010->32010/tcp, 45678/tcp   dremio

886545b2f25f   minio/minio                   "/usr/bin/docker-ent…"   5 days ago    Up 8 minutes             0.0.0.0:9000-9001->9000-9001/tcp, [::]:9000-9001->9000-9001/tcp

                                                                                           minio

5ab3191a896b   hello-world                   "/hello"                 5 weeks ago   Exited (0) 5 weeks ago

                                                                                           tender_montalcini

[root@localhost de]#



3. Login to spark notebook browser


Enter the token from notebook container output


http://127.0.0.1:8888/?token=22f81663131ec8ebf8d67b15a9a59d0748e875a4c717ba49


4. Observe the homepage; we mainly have 2 folders:


a. notebook

b. sampledata


5. Let us create a new notebook by going into the notebook folder & load a csv content into an iceberg table


****** notebook content ******


import pyspark

from pyspark.sql import SparkSession

import os



## DEFINE SENSITIVE VARIABLES

NESSIE_URI = os.environ.get("NESSIE_URI") ## Nessie Server URI

WAREHOUSE = os.environ.get("WAREHOUSE") ## BUCKET TO WRITE DATA TOO

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY") ## AWS CREDENTIALS

AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY") ## AWS CREDENTIALS

AWS_S3_ENDPOINT= os.environ.get("AWS_S3_ENDPOINT") ## MINIO ENDPOINT



print(AWS_S3_ENDPOINT)

print(NESSIE_URI)

print(WAREHOUSE)



conf = (

    pyspark.SparkConf()

        .setAppName('app_name')

        .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')

        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')

        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')

        .set('spark.sql.catalog.nessie.uri', NESSIE_URI)

        .set('spark.sql.catalog.nessie.ref', 'main')

        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')

        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')

        .set('spark.sql.catalog.nessie.s3.endpoint', AWS_S3_ENDPOINT)

        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)

        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')

        .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)

        .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)

)



## Start Spark Session

spark = SparkSession.builder.config(conf=conf).getOrCreate()

print("Spark Running")



### load a csv into sql view


csv_df=spark.read.format("csv").option("header", "true").load("../sampledata/Worker_Coops.csv")

csv_df.createOrReplaceTempView("csv_open_2025")


### load the view into apache iceberg table or create iceberg table from view


spark.sql("create table if not exists nessie.Worker_Coops_2025 using iceberg as select * from csv_open_2025;").show()

spark.sql("select * from nessie.Worker_Coops_2025 limit 10;").show()




>>> table is created successfully and records are fetched as well. Capture the output next time.


Adjust the size of default vmdk Vagrantfile disks (you can use this procedure to relocate + resize as well)

 

Adjust the size of default vmdk Vagrantfile disks (you can use this procedure to relocate + resize as well):


1) Shutdown the VM


2) VBoxManage.exe clonehd box-disk001.vmdk E:\Virtualbox_VMs\voracle9x-docker-sa1\voracle9x-docker-sa1-box-disk001.vdi --format VDI



C:\VirtualBox_VMs\voracle9x-docker-sa1_default_1763076112652_13435>"C:\Program Files\Oracle\VirtualBox"\VBoxManage.exe clonehd box-disk001.vmdk E:\Virtualbox_VMs\voracle9x-docker-sa1\voracle9x-docker-sa1-box-disk001.vdi --format VDI

0%...10%...20%...30%...40%...50%...60%...70%...80%...90%...100%

Clone medium created in format 'VDI'. UUID: f09bc1a5-fdab-44b0-814b-cb599464562f




3) VBoxManage.exe modifyhd E:\Virtualbox_VMs\voracle9x-docker-sa1\voracle9x-docker-sa1-box-disk001.vdi --resize 76800

C:\VirtualBox_VMs\voracle9x-docker-sa1_default_1763076112652_13435>"C:\Program Files\Oracle\VirtualBox"\VBoxManage.exe modifyhd E:\Virtualbox_VMs\voracle9x-docker-sa1\voracle9x-docker-sa1-box-disk001.vdi --resize 76800

0%...10%...20%...30%...40%...50%...60%...70%...80%...90%...100%


C:\VirtualBox_VMs\voracle9x-docker-sa1_default_1763076112652_13435>




4) Attach the new disk to VirtualBox media


5) Attach the new vdi disk in place of the existing vmdk (only disk sata0) attached to the VM using VBox.


6) Boot the VM using "vagrant up"


7) Run "fdisk -l" to list the partition table, we will see the partition table changed and new expanded size visible. in my case it expanded from 32GB to 75GB.


8) Run "cfdisk", in my case only 1 disk, so it picked up all the partitions from that disk


9) Choose /dev/sda3 (which corresponds to the / FS), use the arrow keys to select Resize, and enter the new size (74GB in my case); then choose Write and accept the partition modification with "yes"


10) Verify all the pvs, lvs, vgdisplay outputs to ensure the current sizes of the / FS is aligned.


11) Run "pvresize /dev/sda3" [partition corresponds to / FS]


12) Run "pvdisplay /dev/sda3" [partition corresponds to / FS] -- should display the extra space in free space.


13) Run "lvextend -l +100%FREE /dev/mapper/vg_main-lv_root" [vg corresponds to the / FS]


14) Run "xfs_growfs /" to accommodate the growth


15) Finally, review the allocated size using "df -Th".


**********************Sample output:


Before:


(base) [root@localhost ~]# pvdisplay

  --- Physical volume ---

  PV Name               /dev/sda3

  VG Name               vg_main

  PV Size               <36.00 GiB / not usable 0

  Allocatable           yes (but full)

  PE Size               4.00 MiB

  Total PE              9215

  Free PE               0

  Allocated PE          9215

  PV UUID               6X5HFj-GRnV-Idjh-BO95-cLUf-biD4-dFPP3z


(base) [root@localhost ~]# pvdisplay /dev/sda3

  --- Physical volume ---

  PV Name               /dev/sda3

  VG Name               vg_main

  PV Size               <36.00 GiB / not usable 0

  Allocatable           yes (but full)

  PE Size               4.00 MiB

  Total PE              9215

  Free PE               0

  Allocated PE          9215

  PV UUID               6X5HFj-GRnV-Idjh-BO95-cLUf-biD4-dFPP3z


(base) [root@localhost ~]# vgdisplay vg_main

  --- Volume group ---

  VG Name               vg_main

  System ID

  Format                lvm2

  Metadata Areas        1

  Metadata Sequence No  5

  VG Access             read/write

  VG Status             resizable

  MAX LV                0

  Cur LV                2

  Open LV               2

  Max PV                0

  Cur PV                1

  Act PV                1

  VG Size               <36.00 GiB

  PE Size               4.00 MiB

  Total PE              9215

  Alloc PE / Size       9215 / <36.00 GiB

  Free  PE / Size       0 / 0

  VG UUID               7VoDZZ-Qqta-yTHp-UDUc-Ovju-UFID-ms8Riy


(base) [root@localhost ~]# lvdisplay /dev/vg_main/lv_root

  --- Logical volume ---

  LV Path                /dev/vg_main/lv_root

  LV Name                lv_root

  VG Name                vg_main

  LV UUID                btd7vi-Qtwj-nsIw-0tqf-2ury-a160-SA966o

  LV Write Access        read/write

  LV Creation host, time localhost.localdomain, 2025-08-28 12:38:32 +0000

  LV Status              available

  # open                 1

  LV Size                <32.00 GiB

  Current LE             8191

  Segments               1

  Allocation             inherit

  Read ahead sectors     auto

  - currently set to     256

  Block device           252:0


(base) [root@localhost ~]# df -Th

Filesystem                  Type      Size  Used Avail Use% Mounted on

devtmpfs                    devtmpfs  4.0M     0  4.0M   0% /dev

tmpfs                       tmpfs     2.9G     0  2.9G   0% /dev/shm

tmpfs                       tmpfs     1.2G   17M  1.2G   2% /run

/dev/mapper/vg_main-lv_root xfs        32G   24G  8.9G  73% /

/dev/sda2                   xfs       960M  217M  744M  23% /boot

vagrant                     vboxsf    466G  404G   63G  87% /vagrant

tmpfs                       tmpfs     593M  4.0K  593M   1% /run/user/1000

tmpfs                       tmpfs     593M  4.0K  593M   1% /run/user/0

(base) [root@localhost ~]



After:

(base) [root@localhost ~]# pvresize /dev/sda3

  Physical volume "/dev/sda3" changed

  1 physical volume(s) resized or updated / 0 physical volume(s) not resized

(base) [root@localhost ~]# pvdisplay /dev/sda3

  --- Physical volume ---

  PV Name               /dev/sda3

  VG Name               vg_main

  PV Size               <74.00 GiB / not usable 16.50 KiB

  Allocatable           yes

  PE Size               4.00 MiB

  Total PE              18943

  Free PE               9728

  Allocated PE          9215

  PV UUID               6X5HFj-GRnV-Idjh-BO95-cLUf-biD4-dFPP3z


(base) [root@localhost ~]# lvextend -l +100%FREE /dev/mapper/vg_main-lv_root

  Size of logical volume vg_main/lv_root changed from <32.00 GiB (8191 extents) to <70.00 GiB (17919 extents).

  Logical volume vg_main/lv_root successfully resized.

(base) [root@localhost ~]# df -Th /

Filesystem                  Type  Size  Used Avail Use% Mounted on

/dev/mapper/vg_main-lv_root xfs    32G   24G  8.9G  73% /

(base) [root@localhost ~]# xfs_growfs /

meta-data=/dev/mapper/vg_main-lv_root isize=512    agcount=4, agsize=2096896 blks

         =                       sectsz=512   attr=2, projid32bit=1

         =                       crc=1        finobt=1, sparse=1, rmapbt=0

         =                       reflink=1    bigtime=1 inobtcount=1 nrext64=0

         =                       exchange=0

data     =                       bsize=4096   blocks=8387584, imaxpct=25

         =                       sunit=0      swidth=0 blks

naming   =version 2              bsize=4096   ascii-ci=0, ftype=1, parent=0

log      =internal log           bsize=4096   blocks=16384, version=2

         =                       sectsz=512   sunit=0 blks, lazy-count=1

realtime =none                   extsz=4096   blocks=0, rtextents=0

data blocks changed from 8387584 to 18349056

(base) [root@localhost ~]# df -Th /

Filesystem                  Type  Size  Used Avail Use% Mounted on

/dev/mapper/vg_main-lv_root xfs    70G   24G   47G  34% /

(base) [root@localhost ~]#

****************



How to fix internet issues on virtualbox guest VMs with NAT

 How to fix internet issues on virtualbox guest VMs with NAT


1) Identify the issue


(base) [root@localhost ~]# ping www.google.com

^C

(base) [root@localhost ~]#


2) A quick fix is to edit /etc/resolv.conf as shown below:


Actual:


(base) [root@localhost ~]# cat /etc/resolv.conf

# Generated by NetworkManager

nameserver 10.0.2.3

(base) [root@localhost ~]#


Replace it as shown below:


(base) [root@localhost ~]# cat /etc/resolv.conf

# Generated by NetworkManager

nameserver 8.8.8.8

nameserver 1.1.1.1

(base) [root@localhost ~]#


3) Test the fix 


(base) [root@localhost ~]# ping google.com

PING google.com (192.178.223.113) 56(84) bytes of data.

64 bytes from yulhrs-in-f113.1e100.net (192.178.223.113): icmp_seq=1 ttl=105 time=7.23 ms

^C

--- google.com ping statistics ---

1 packets transmitted, 1 received, 0% packet loss, time 0ms

rtt min/avg/max/mdev = 7.227/7.227/7.227/0.000 ms

(base) [root@localhost ~]#


4) Make it persistent across restarts


(base) [root@localhost ~]# nmcli conn show

NAME             UUID                                  TYPE      DEVICE

eth0             a313d9c0-8613-468f-a354-e892fe7e3817  ethernet  eth0

System eth1      9c92fad9-6ecb-3e6c-eb4d-8a47c6f50c04  ethernet  eth1

lo               a12a3dcb-af7a-4987-b824-7a9fb48c3b0b  loopback  lo

br-7ca1385b85f2  b51a597f-e9c5-43f1-95bf-68f5ed335192  bridge    br-7ca1385b85f2

docker0          21fd4af2-d44f-4931-9b1a-985fa92769eb  bridge    docker0

(base) [root@localhost ~]#


5) Look for the connection supplying dns


(base) [root@localhost ~]# nmcli dev show | grep DNS

IP4.DNS[1]:                             10.0.2.3

(base) [root@localhost ~]#


6) Examine the below network manager conf file


cat /etc/NetworkManager/system-connections/eth0.nmconnection

[connection]

id=eth0

uuid=a313d9c0-8613-468f-a354-e892fe7e3817

type=ethernet

autoconnect-priority=-100

autoconnect-retries=1

interface-name=eth0

multi-connect=1

timestamp=1756384622

wait-device-timeout=60000


[ethernet]


[ipv4]

method=auto


[ipv6]

addr-gen-mode=eui64

method=auto


[proxy]


[user]

org.freedesktop.NetworkManager.origin=nm-initrd-generator

(base) [root@localhost ~]#


7) Look for dns related setting


grep -i dns /etc/NetworkManager/system-connections/eth0.nmconnection


8) Add the nameservers now which we added above


nmcli con mod "eth0" ipv4.dns "8.8.8.8,1.1.1.1"

nmcli con mod "eth0" ipv4.ignore-auto-dns yes

nmcli con up "eth0"


Actual:

(base) [root@localhost ~]# grep -i dns /etc/NetworkManager/system-connections/eth0.nmconnection

(base) [root@localhost ~]# nmcli con mod "eth0" ipv4.dns "8.8.8.8,1.1.1.1"

(base) [root@localhost ~]# nmcli con mod "eth0" ipv4.ignore-auto-dns yes

(base) [root@localhost ~]# nmcli con up "eth0"

Connection successfully activated (D-Bus active path: /org/freedesktop/NetworkManager/ActiveConnection/6)

(base) [root@localhost ~]#


9) Check if ip is up


(base) [root@localhost ~]# ip a

1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000

    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00

    inet 127.0.0.1/8 scope host lo

       valid_lft forever preferred_lft forever

    inet6 ::1/128 scope host

       valid_lft forever preferred_lft forever

2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000

    link/ether 08:00:27:dc:3b:82 brd ff:ff:ff:ff:ff:ff

    altname enp0s3

    inet 10.0.2.15/24 brd 10.0.2.255 scope global dynamic noprefixroute eth0

       valid_lft 86398sec preferred_lft 86398sec

    inet6 fe80::a00:27ff:fedc:3b82/64 scope link noprefixroute

       valid_lft forever preferred_lft forever



10) Verify the file /etc/NetworkManager/system-connections/eth0.nmconnection


(base) [root@localhost ~]# cat /etc/NetworkManager/system-connections/eth0.nmconnection

[connection]

id=eth0

uuid=a313d9c0-8613-468f-a354-e892fe7e3817

type=ethernet

autoconnect-priority=-100

autoconnect-retries=1

interface-name=eth0

multi-connect=1

timestamp=1769037930

wait-device-timeout=60000


[ethernet]


[ipv4]

dns=8.8.8.8;1.1.1.1;

ignore-auto-dns=true

method=auto


[ipv6]

addr-gen-mode=eui64

method=auto


[proxy]


[user]

org.freedesktop.NetworkManager.origin=nm-initrd-generator

(base) [root@localhost ~]#


11) Verify further setup and test internet


(base) [root@localhost ~]# nmcli dev show | grep DNS

IP4.DNS[1]:                             8.8.8.8

IP4.DNS[2]:                             1.1.1.1

(base) [root@localhost ~]# cat /etc/resolv.conf

# Generated by NetworkManager

nameserver 8.8.8.8

nameserver 1.1.1.1

(base) [root@localhost ~]#


12) check internet


(base) [root@localhost ~]# ping google.com

PING google.com (192.178.223.113) 56(84) bytes of data.

64 bytes from yulhrs-in-f113.1e100.net (192.178.223.113): icmp_seq=1 ttl=105 time=7.24 ms

^C

--- google.com ping statistics ---

1 packets transmitted, 1 received, 0% packet loss, time 0ms

rtt min/avg/max/mdev = 7.244/7.244/7.244/0.000 ms

(base) [root@localhost ~]#


Ingest csv data into Apache Iceberg using spark notebook

  Objective: Ingest csv data into Apache Iceberg using spark notebook Steps: 1. Start our VM 2. start our docker containers cd /opt/de [root...