[Spark] Running Apache Spark + Apache Zeppelin
박경태
2022. 4. 15. 12:10
The files used in this post are organized in the GitHub repo below:
https://github.com/ParkGyeongTae/spark-pgt/tree/main/2_spark-cluster-zeppelin
First, let's look at the file structure.
Dockerfile

FROM ubuntu:18.04

LABEL maintainer="ParkGyeongTae"

# Suppress interactive prompts during apt installs
ENV DEBIAN_FRONTEND=noninteractive

# Switch the apt mirror from the default (US) to the Korean mirror
RUN sed -i 's@archive.ubuntu.com@kr.archive.ubuntu.com@g' /etc/apt/sources.list

# Commonly used utilities
RUN apt-get update && \
    apt-get install -y net-tools iputils-ping vim wget

# Work under /home
WORKDIR /home

# JDK 8
RUN apt-get install -y openjdk-8-jdk

# spark-3.2.1-bin-hadoop3.2
RUN wget https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz && \
    tar -xvf spark-3.2.1-bin-hadoop3.2.tgz && \
    mv spark-3.2.1-bin-hadoop3.2 spark && \
    rm -rf spark-3.2.1-bin-hadoop3.2.tgz

# Python 3.8
RUN apt-get install -y python3.8 python3-pip && \
    rm -rf /usr/bin/python3 && \
    ln -s /usr/bin/python3.8 /usr/bin/python3 && \
    ln -s /usr/bin/python3.8 /usr/bin/python

# zeppelin-0.10.1
RUN wget https://dlcdn.apache.org/zeppelin/zeppelin-0.10.1/zeppelin-0.10.1-bin-all.tgz && \
    tar -zxf zeppelin-0.10.1-bin-all.tgz && \
    mv zeppelin-0.10.1-bin-all zeppelin && \
    rm -rf zeppelin-0.10.1-bin-all.tgz

# MySQL JDBC driver for Zeppelin's jdbc interpreter
WORKDIR /home/zeppelin/interpreter/jdbc
RUN wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-8.0.24.tar.gz && \
    tar -zxf mysql-connector-java-8.0.24.tar.gz && \
    rm -rf mysql-connector-java-8.0.24.tar.gz
WORKDIR /home

# pip3 configuration
RUN mkdir /root/.pip && \
    set -x && \
    { \
        echo '[global]'; \
        echo 'timeout = 60'; \
        echo '[freeze]'; \
        echo 'timeout = 10'; \
        echo '[list]'; \
        echo 'format = columns'; \
    } > /root/.pip/pip.conf && \
    pip3 install --upgrade pip

# Environment variables
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
ENV SPARK_HOME /home/spark
ENV ZEPPELIN_HOME /home/zeppelin
ENV PATH $PATH:$JAVA_HOME/bin:$SPARK_HOME/bin:$ZEPPELIN_HOME/bin

# Replace the Spark configuration files
COPY ./spark-cluster/spark-cluster-conf/spark-env.sh /home/spark/conf/spark-env.sh
COPY ./spark-cluster/spark-cluster-conf/log4j.properties /home/spark/conf/log4j.properties
RUN rm -rf /home/spark/conf/spark-env.sh.template && \
    rm -rf /home/spark/conf/log4j.properties.template && \
    rm -rf /home/spark/bin/*.cmd

# Replace the Zeppelin configuration files
RUN rm -rf /home/zeppelin/conf/zeppelin-env.sh.template && \
    rm -rf /home/zeppelin/conf/zeppelin-env.cmd.template && \
    rm -rf /home/zeppelin/conf/interpreter.json && \
    rm -rf /home/zeppelin/bin/*.cmd
COPY ./zeppelin/zeppelin-conf/zeppelin-env.sh /home/zeppelin/conf/zeppelin-env.sh
COPY ./zeppelin/zeppelin-conf/interpreter.json /home/zeppelin/conf/interpreter.json

# Start Spark automatically when the container starts
COPY ./spark-cluster/spark-cluster-entrypoint/entrypoint-spark.sh /usr/local/bin/
ENTRYPOINT ["entrypoint-spark.sh"]
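The entrypoint-spark.sh copied in the last step is not reproduced in this post (it lives in the repo linked above). As a rough sketch, assuming it branches on the SPARK_MODE variable that docker-compose.yml below sets for each service, it could look something like this:

#!/bin/bash
# Hypothetical sketch -- the actual entrypoint-spark.sh is in the linked repo.
# SPARK_MODE is set per service in docker-compose.yml.
case "$SPARK_MODE" in
  master)
    $SPARK_HOME/sbin/start-master.sh \
      --host "$SPARK_MASTER_HOST" \
      --port "$SPARK_MASTER_PORT" \
      --webui-port "$SPARK_MASTER_WEBUI_PORT"
    ;;
  slave)
    $SPARK_HOME/sbin/start-worker.sh \
      "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT" \
      --cores "$SPARK_WORKER_CORES" \
      --memory "$SPARK_WORKER_MEMORY" \
      --port "$SPARK_WORKER_PORT" \
      --webui-port "$SPARK_WORKER_WEBUI_PORT"
    ;;
  zeppelin)
    zeppelin-daemon.sh start
    ;;
esac

# The sbin scripts start background daemons, so keep the container alive.
tail -f /dev/null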
docker-compose.yml

version: '2.1'
services:
  spark-master:
    hostname: spark-master
    container_name: spark-master
    image: spark-cluster:0.03
    restart: always
    ports:
      - 18080:18080 # Spark master web UI
    stdin_open: true
    tty: true
    environment:
      - SPARK_MODE=master
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
  spark-slave-1:
    hostname: spark-slave-1
    container_name: spark-slave-1
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 8081:8081 # Spark worker web UI
    environment:
      - SPARK_MODE=slave
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
      - SPARK_WORKER_CORES=4
      - SPARK_WORKER_MEMORY=4g
      - SPARK_WORKER_WEBUI_PORT=8081
      - SPARK_WORKER_PORT=18081
    depends_on:
      - spark-master
  spark-slave-2:
    hostname: spark-slave-2
    container_name: spark-slave-2
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 8082:8082 # Spark worker web UI
    environment:
      - SPARK_MODE=slave
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
      - SPARK_WORKER_CORES=4
      - SPARK_WORKER_MEMORY=4g
      - SPARK_WORKER_WEBUI_PORT=8082
      - SPARK_WORKER_PORT=18082
    depends_on:
      - spark-master
  spark-slave-3:
    hostname: spark-slave-3
    container_name: spark-slave-3
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 8083:8083 # Spark worker web UI
    environment:
      - SPARK_MODE=slave
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=17077
      - SPARK_MASTER_WEBUI_PORT=18080
      - SPARK_WORKER_CORES=4
      - SPARK_WORKER_MEMORY=4g
      - SPARK_WORKER_WEBUI_PORT=8083
      - SPARK_WORKER_PORT=18083
    depends_on:
      - spark-master
  zeppelin:
    hostname: zeppelin
    container_name: zeppelin
    image: spark-cluster:0.03
    restart: always
    stdin_open: true
    tty: true
    ports:
      - 9999:9999 # Zeppelin web UI
    environment:
      - SPARK_MODE=zeppelin
      - SPARK_MASTER=spark://spark-master:17077
      - ZEPPELIN_PORT=9999
      - SPARK_APP_NAME=MyFirstZeppelin
      - TOTAL_EXECUTOR_CORES=4
    depends_on:
      - spark-master
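The whole topology is driven by environment variables: SPARK_MODE selects each container's role, the SPARK_MASTER_* / SPARK_WORKER_* variables configure the daemons, and the zeppelin service receives the master URL plus application settings. The spark-env.sh that the Dockerfile copies is not reproduced in this post; a minimal sketch of what it might contain, assuming it simply forwards these variables with fallback defaults:

# Hypothetical sketch -- the actual spark-env.sh is in the linked repo.
# Forward the compose-level variables to the Spark daemons.
export SPARK_MASTER_HOST=${SPARK_MASTER_HOST:-spark-master}
export SPARK_MASTER_PORT=${SPARK_MASTER_PORT:-17077}
export SPARK_MASTER_WEBUI_PORT=${SPARK_MASTER_WEBUI_PORT:-18080}
export SPARK_WORKER_CORES=${SPARK_WORKER_CORES:-4}
export SPARK_WORKER_MEMORY=${SPARK_WORKER_MEMORY:-4g}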
Before building anything, check the current containers and images:

docker ps -a
docker images
Build the image with the build script from the repo:

./full_build.sh
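full_build.sh itself is not shown here; a typical version would tear down the old cluster and rebuild the image with the tag that docker-compose.yml expects (hypothetical sketch; the real script is in the linked repo):

#!/bin/bash
# Hypothetical sketch -- the actual full_build.sh is in the linked repo.
docker-compose down                     # stop any running cluster
docker rmi spark-cluster:0.03 || true   # drop the old image if present
docker build -t spark-cluster:0.03 .    # rebuild from the Dockerfile above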
Then start the whole cluster in the background:

docker-compose up -d
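Once the stack is up, all five containers (spark-master, spark-slave-1 through 3, and zeppelin) should be running. A quick check, assuming the custom log4j.properties keeps the master's INFO logging:

docker-compose ps
docker logs spark-master 2>&1 | grep -i "registering worker"   # one line per worker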
Open the Spark master web UI at localhost:18080. The three workers should be listed as ALIVE.
Next, open Zeppelin at localhost:9999 and create a new notebook (Notebook > Create new note, then Create).
In the first paragraph, run:

sc

If the Spark interpreter starts correctly, the paragraph prints the bound SparkContext.
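To check that the notebook is attached to the standalone master rather than a local Spark, the context can also be inspected in the same notebook (the expected values follow from docker-compose.yml):

sc.master  // spark://spark-master:17077, from SPARK_MASTER
sc.appName // MyFirstZeppelin, from SPARK_APP_NAME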
Go back to the master UI at localhost:18080: the Zeppelin session should now appear under Running Applications (named MyFirstZeppelin, per SPARK_APP_NAME).
Finally, run a small job to confirm the cluster actually executes work:

val data = sc.parallelize(1 to 100 by 1) // distribute the range 1..100 across the workers
data.count // action: runs the job and returns 100
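Reusing the data RDD from the previous paragraph, a few more one-liners confirm that transformations and actions run on the cluster:

val total = data.reduce(_ + _)            // sum of 1..100 = 5050
val evens = data.filter(_ % 2 == 0).count // 50 even numbers
data.getNumPartitions                     // how many partitions the range was split into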