
[Spark] Running Apache Spark + Apache Zeppelin

박경태 2022. 4. 15. 12:10

The files used in this post are organized in the GitHub repository below.

https://github.com/ParkGyeongTae/spark-pgt/tree/main/2_spark-cluster-zeppelin

 


 

First, let's look at the file structure.
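
The layout below is a sketch reconstructed from the paths referenced in the Dockerfile and docker-compose.yml that follow; check the repository for the exact layout.

2_spark-cluster-zeppelin/
├── dockerfile
├── docker-compose.yml
├── full_build.sh
├── spark-cluster/
│   ├── spark-cluster-conf/
│   │   ├── spark-env.sh
│   │   └── log4j.properties
│   └── spark-cluster-entrypoint/
│       └── entrypoint-spark.sh
└── zeppelin/
    └── zeppelin-conf/
        ├── zeppelin-env.sh
        └── interpreter.json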

 

dockerfile

FROM ubuntu:18.04
LABEL maintainer "ParkGyeongTae"

# Suppress interactive prompts during apt installs
ENV DEBIAN_FRONTEND=noninteractive

# Switch the apt mirror from the default (US) to the Korean mirror
RUN sed -i 's@archive.ubuntu.com@kr.archive.ubuntu.com@g' /etc/apt/sources.list

# Install commonly used packages
RUN apt-get update && \
    apt-get install -y net-tools iputils-ping vim wget

# Working directory: /home
WORKDIR /home

# OpenJDK 8
RUN apt-get install openjdk-8-jdk -y

# spark-3.2.1-bin-hadoop3.2
RUN wget https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz && \
    tar -xvf spark-3.2.1-bin-hadoop3.2.tgz && \
    mv spark-3.2.1-bin-hadoop3.2 spark && \
    rm -rf spark-3.2.1-bin-hadoop3.2.tgz

# Python 3.8: install it and repoint the python3/python symlinks
RUN apt-get install python3.8 -y && \
    apt-get install python3-pip -y && \
    rm -rf /usr/bin/python3 && \
    ln -s /usr/bin/python3.8 /usr/bin/python3 && \
    ln -s /usr/bin/python3.8 /usr/bin/python

# zeppelin-0.10.1
RUN wget https://dlcdn.apache.org/zeppelin/zeppelin-0.10.1/zeppelin-0.10.1-bin-all.tgz && \
    tar -zxf zeppelin-0.10.1-bin-all.tgz && \
    mv zeppelin-0.10.1-bin-all zeppelin && \
    rm -rf zeppelin-0.10.1-bin-all.tgz

# MySQL JDBC driver for Zeppelin's JDBC interpreter
WORKDIR /home/zeppelin/interpreter/jdbc

RUN wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.24.tar.gz && \
    tar -zxf mysql-connector-java-8.0.24.tar.gz && \
    rm -rf mysql-connector-java-8.0.24.tar.gz

WORKDIR /home

# pip3 configuration
RUN mkdir /root/.pip && \
    set -x \
    && { \
    echo '[global]'; \
    echo 'timeout = 60'; \
    echo '[freeze]'; \
    echo 'timeout = 10'; \
    echo '[list]'; \
    echo 'format = columns'; \
    } > /root/.pip/pip.conf && \
    pip3 install --upgrade pip
    

# Environment variables
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
ENV SPARK_HOME /home/spark
ENV ZEPPELIN_HOME /home/zeppelin
ENV PATH $PATH:$JAVA_HOME/bin:$SPARK_HOME/bin:$ZEPPELIN_HOME/bin

# Replace the Spark config files with our own
COPY ./spark-cluster/spark-cluster-conf/spark-env.sh /home/spark/conf/spark-env.sh
COPY ./spark-cluster/spark-cluster-conf/log4j.properties /home/spark/conf/log4j.properties

RUN rm -rf /home/spark/conf/spark-env.sh.template && \
    rm -rf /home/spark/conf/log4j.properties.template && \
    rm -rf /home/spark/bin/*.cmd

# Replace the Zeppelin config files
RUN rm -rf /home/zeppelin/conf/zeppelin-env.sh.template && \
    rm -rf /home/zeppelin/conf/zeppelin-env.cmd.template && \
    rm -rf /home/zeppelin/conf/interpreter.json && \
    rm -rf /home/zeppelin/bin/*.cmd

COPY ./zeppelin/zeppelin-conf/zeppelin-env.sh /home/zeppelin/conf/zeppelin-env.sh
COPY ./zeppelin/zeppelin-conf/interpreter.json /home/zeppelin/conf/interpreter.json

# Start Spark automatically when the container starts
COPY ./spark-cluster/spark-cluster-entrypoint/entrypoint-spark.sh /usr/local/bin/

ENTRYPOINT ["entrypoint-spark.sh"]
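
The entrypoint script itself is not reproduced in this post. Below is a minimal sketch of what entrypoint-spark.sh plausibly does, assuming it dispatches on the SPARK_MODE variable set per service in docker-compose.yml; the real script lives under ./spark-cluster/spark-cluster-entrypoint/ in the repository.

#!/bin/bash
# Hypothetical sketch -- see the repository for the actual entrypoint-spark.sh

case "$SPARK_MODE" in
  master)
    # Standalone master, using the host/ports passed in via the environment
    "$SPARK_HOME"/sbin/start-master.sh \
      --host "$SPARK_MASTER_HOST" \
      --port "$SPARK_MASTER_PORT" \
      --webui-port "$SPARK_MASTER_WEBUI_PORT"
    ;;
  slave)
    # Worker that registers itself with the master
    "$SPARK_HOME"/sbin/start-worker.sh \
      "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT" \
      --cores "$SPARK_WORKER_CORES" \
      --memory "$SPARK_WORKER_MEMORY" \
      --port "$SPARK_WORKER_PORT" \
      --webui-port "$SPARK_WORKER_WEBUI_PORT"
    ;;
  zeppelin)
    # Zeppelin server; the port comes from ZEPPELIN_PORT
    "$ZEPPELIN_HOME"/bin/zeppelin-daemon.sh start
    ;;
esac

# The start scripts daemonize, so keep the container in the foreground
tail -f /dev/null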

 

docker-compose.yml

version: '2.1'

services:

  spark-master:
    hostname: spark-master
    container_name: spark-master
    image: spark-cluster:0.03
    restart: always
    ports:
      - 18080:18080 # Spark master web UI
    stdin_open: true
    tty: true
    environment:
      - SPARK_MODE=master
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080

  spark-slave-1:
    hostname: spark-slave-1
    container_name: spark-slave-1
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 8081:8081 # Spark worker web UI
    environment:
      - SPARK_MODE=slave
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
      - SPARK_WORKER_CORES=4
      - SPARK_WORKER_MEMORY=4g
      - SPARK_WORKER_WEBUI_PORT=8081
      - SPARK_WORKER_PORT=18081
    depends_on:
      - spark-master

  spark-slave-2:
    hostname: spark-slave-2
    container_name: spark-slave-2
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 8082:8082 # Spark worker web UI
    environment:
      - SPARK_MODE=slave
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
      - SPARK_WORKER_CORES=4
      - SPARK_WORKER_MEMORY=4g
      - SPARK_WORKER_WEBUI_PORT=8082
      - SPARK_WORKER_PORT=18082
    depends_on:
      - spark-master

  spark-slave-3:
    hostname: spark-slave-3
    container_name: spark-slave-3
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 8083:8083 # Spark worker web UI
    environment:
      - SPARK_MODE=slave
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
      - SPARK_WORKER_CORES=4
      - SPARK_WORKER_MEMORY=4g
      - SPARK_WORKER_WEBUI_PORT=8083
      - SPARK_WORKER_PORT=18083
    depends_on:
      - spark-master

  zeppelin:
    hostname: zeppelin
    container_name: zeppelin
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 9999:9999
    environment:
      - SPARK_MODE=zeppelin
      - SPARK_MASTER=spark://spark-master:17077
      - ZEPPELIN_PORT=9999
      - SPARK_APP_NAME=MyFirstZeppelin
      - TOTAL_EXECUTOR_CORES=4
    depends_on:
      - spark-master
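
Two details worth noting: each worker publishes a different web UI port (8081, 8082, 8083) because every port published to the host must be unique, and the zeppelin service receives SPARK_MASTER, SPARK_APP_NAME, and TOTAL_EXECUTOR_CORES, which are presumably consumed by the zeppelin-env.sh copied in the Dockerfile so that the Spark interpreter attaches to the standalone master.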

 

Check the current containers and images before building:

docker ps -a
docker images

 

Build the image with the build script included in the repository:

./full_build.sh
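
full_build.sh is not reproduced here. A plausible minimal version, assuming it just rebuilds the spark-cluster:0.03 image that docker-compose.yml references (the real script is in the repository):

#!/bin/bash
# Hypothetical sketch -- see the repository for the actual full_build.sh

docker-compose down                    # stop and remove any running cluster
docker rmi spark-cluster:0.03          # drop the old image (fails harmlessly on first run)
docker build -t spark-cluster:0.03 .   # rebuild from the dockerfile above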

 

Bring the cluster up in detached mode:

docker-compose up -d
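
Once the command returns, docker-compose ps (or docker ps) should list five running containers: spark-master, spark-slave-1, spark-slave-2, spark-slave-3, and zeppelin.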

 

Open localhost:18080 for the Spark master web UI; the three workers should be registered there with status ALIVE.

 

Open localhost:9999 for the Zeppelin web UI (the ZEPPELIN_PORT set in docker-compose.yml).

 

In Zeppelin, click Create new note, enter a name for the note, and click Create.

 

 

In the first paragraph, run sc to confirm the SparkContext is available:

sc

 

Back at localhost:18080, the note's application (MyFirstZeppelin, the SPARK_APP_NAME from docker-compose.yml) should now appear in the master UI's Running Applications list.

 

Create an RDD of the numbers 1 through 100:

val data = sc.parallelize(1 to 100 by 1)

 

Count the elements; this action runs as a job on the cluster and returns 100:

data.count
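
As an optional extra check (not part of the original walkthrough), a few more standard RDD actions on the same data value give a quick end-to-end test of the cluster:

data.getNumPartitions  // how many partitions the range was split into
data.sum               // 1 + 2 + ... + 100 = 5050.0 (sum returns a Double)
data.reduce(_ + _)     // 5050, computed across the executors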