Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()
sc = spark.sparkContext

line_1 = 'i love you'
line_2 = 'you are my friend'
line_3 = 'my name is park'

lines = sc.parallelize([line_1.upper(), line_2.upper(), line_3.upper()])
lines_map = lines.map(lambda x: x.lower().split(' '))
lines_flatmap = lines.flatMap(lambda x: x.lower().split('..
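The preview cuts off at the flatMap call. A minimal runnable sketch of the whole script, assuming the flatMap mirrors the map line and the script ends by printing the collected RDDs and stopping the session (the print format follows the pattern of the next snippet):

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()
sc = spark.sparkContext

line_1 = 'i love you'
line_2 = 'you are my friend'
line_3 = 'my name is park'

# parallelize the upper-cased sentences into an RDD
lines = sc.parallelize([line_1.upper(), line_2.upper(), line_3.upper()])

# map keeps one output element per input: each sentence becomes a list of lower-cased words
lines_map = lines.map(lambda x: x.lower().split(' '))
# flatMap flattens those per-sentence lists into a single RDD of words
lines_flatmap = lines.flatMap(lambda x: x.lower().split(' '))

print(f'lines.collect() : {lines.collect()}')
print(f'lines_map.collect() : {lines_map.collect()}')
print(f'lines_flatmap.collect() : {lines_flatmap.collect()}')

spark.stop()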
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()
sc = spark.sparkContext

line_1 = 'i love you'
line_2 = 'you are my friend'
line_3 = 'my name is park'

lines = sc.parallelize([line_1, line_2, line_3])
lines_map = lines.map(lambda x: x.split(' '))
lines_flatmap = lines.flatMap(lambda x: x.split(' '))
print(f'lines.collect() : {lines.co..
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()
sc = spark.sparkContext

data = sc.parallelize(range(0, 10))
data_map = data.map(lambda x: x * x)
data_reduce = data.reduce(lambda x, y: x + y)
data_map_reduce = data_map.reduce(lambda x, y: x + y)

print(f'data : {data.collect()}')
print(f'data_map : {data_map.collect()}')
print(f'data_..
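The last print lines are cut off in the preview; presumably they print the two reduced sums. A sketch with the expected values noted in comments (map is a lazy transformation, while reduce is an action that returns a plain Python number to the driver):

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()
sc = spark.sparkContext

data = sc.parallelize(range(0, 10))

# transformation: squares each element lazily
data_map = data.map(lambda x: x * x)

# actions: fold the elements pairwise into a single value on the driver
data_reduce = data.reduce(lambda x, y: x + y)          # 0 + 1 + ... + 9  = 45
data_map_reduce = data_map.reduce(lambda x, y: x + y)  # 0 + 1 + ... + 81 = 285

print(f'data : {data.collect()}')
print(f'data_map : {data_map.collect()}')
print(f'data_reduce : {data_reduce}')
print(f'data_map_reduce : {data_map_reduce}')

spark.stop()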
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd

spark = SparkSession \
    .builder \
    .appName("1_test_dataframe") \
    .getOrCreate()

df_pandas = pd.DataFrame(np.random.rand(100, 3))
df_spark = spark.createDataFrame(df_pandas)
df_re_pandas = df_spark.select("*").toPandas()

print(df_pandas.head(5))
df_spark.show(5)  # show() prints the rows itself and returns None, so wrapping it in print() only adds a stray "None"
print(df_re_pandas.head(5))

spark.stop()
Result
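One optional tweak, not in the original post: if pyarrow is installed, Spark 3.x can use Arrow for the pandas round trip above, which usually makes both createDataFrame(df_pandas) and toPandas() noticeably faster. A minimal sketch, assuming Spark 3.x (on Spark 2.x the key is spark.sql.execution.arrow.enabled):

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("1_test_dataframe") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()
# the rest of the round trip is unchanged; Spark falls back to the non-Arrow path if the conversion cannot use Arrow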
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()

schema = StructType([StructField("first", StringType(), True),
                     StructField("second", StringType(), True),
                     StructField("third", StringType(), True)])

df = spark.read.csv('/home/spark/result/1_test_dataframe.csv', header =..
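The read.csv call is cut off at the header argument. A runnable sketch of the likely remainder; the header=True value, the schema keyword, and the closing show()/stop() calls are assumptions filled in from the column definitions above:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession\
    .builder\
    .appName("0_save_file")\
    .getOrCreate()

# explicit schema: three nullable string columns, so Spark skips type inference
schema = StructType([StructField("first", StringType(), True),
                     StructField("second", StringType(), True),
                     StructField("third", StringType(), True)])

# header=True and schema=schema are assumed; the original arguments are truncated in the preview
df = spark.read.csv('/home/spark/result/1_test_dataframe.csv', header=True, schema=schema)

df.show(5)
spark.stop()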
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession
from pyspark.sql import Row

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    data_1 = [("kim", 1), ("park", 2), ("choi", 3)]
    data_2 = [Row(name='kim', age=5, height=80),
              Row(name='park', age=5, height=80),
              Row(name='choi', age=10, height=80)]

    df_1 = sc.parallelize(data_1).toDF()
    df_2..
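The preview stops at df_2. A sketch of the likely remainder, assuming df_2 is built from the Row objects with createDataFrame and both frames are displayed:

from pyspark.sql import SparkSession
from pyspark.sql import Row

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    data_1 = [("kim", 1), ("park", 2), ("choi", 3)]
    data_2 = [Row(name='kim', age=5, height=80),
              Row(name='park', age=5, height=80),
              Row(name='choi', age=10, height=80)]

    # tuples carry no field names, so toDF() gives default columns _1, _2
    df_1 = sc.parallelize(data_1).toDF()
    # Row objects carry field names, so createDataFrame can infer the schema directly
    df_2 = spark.createDataFrame(data_2)  # assumption: the original may instead use sc.parallelize(data_2).toDF()

    df_1.show()
    df_2.show()
    spark.stop()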
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    lines = sc.parallelize(['i love you', 'you are my friend', 'my name is park'])
    print(f'lines : {lines.collect()}')
    # pairs = lines.map(lambda s: s.split(" "))  # split on spaces
    # pairs = pairs.filter(lambda x: len(x) > 3)  # keep sentences whose word count exceeds 3..
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    lines = sc.parallelize(['i love you', 'you are my friend', 'my name is park'])
    print(f'lines : {lines.collect()}')
    # pairs = lines.map(lambda s: s.split(" "))
    # pairs = pairs.filter(lambda x: len(x) > 3)
    pairs = lines.flatMap(lambda x..
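The flatMap line is cut off. Snippets like this usually continue into a word count; the sketch below assumes that continuation (flatMap into words, pair each word with 1, then reduceByKey to sum the counts) rather than reproducing the original post's exact code:

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    lines = sc.parallelize(['i love you', 'you are my friend', 'my name is park'])

    # flatMap produces one flat RDD of words across all sentences
    words = lines.flatMap(lambda x: x.split(" "))
    # classic word count: (word, 1) pairs summed per key
    counts = words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)

    print(f'counts : {counts.collect()}')
    spark.stop()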
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    lines = sc.parallelize(['i love you', 'you are my friend', 'my name is park'])
    print(lines.collect())

    pairs = lines.map(lambda s: s.split(" "))
    pairs = pairs.filter(lambda x: len(x) > 3)
    print(pairs.collect())

    spark.stop()
Result
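The result block is cut off in the preview, but the behaviour follows from the code: map turns each sentence into its list of words, and the filter keeps only lists longer than three elements, so 'i love you' (3 words) is dropped while 'you are my friend' and 'my name is park' (4 words each) remain.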
Data Engineering/Spark
2022.05.29
Code
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("0_save_file")\
        .getOrCreate()
    sc = spark.sparkContext

    lines = sc.parallelize(['i love you', 'you are my friend', 'my name is park'])
    print(lines.collect())

    pairs = lines.map(lambda s: s.split(" "))
    print(pairs.collect())

    spark.stop()
Result