Data Engineering/Spark
2023.01.14
The following error occurred:

Traceback (most recent call last):
  File "df_schema_null.py", line 23, in <module>
    df = spark.createDataFrame(data = data, schema = schema)
  File "/Users/pgt0409/opt/anaconda3/envs/py38/lib/python3.8/site-packages/pyspark/sql/session.py", line 894, in createDataFrame
    return self._create_dataframe(
  File "/Users/pgt0409/opt/anaconda3/envs/py38/lib/python3.8/site-packages/pyspark/sql/session.py"..
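The preview cuts off before the error message itself. Judging only by the file name (df_schema_null.py), a likely cause is a None value in a column whose StructField is declared non-nullable; below is a minimal sketch that reproduces that kind of failure (the actual post may cover a different case):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master('local').appName('my_pyspark_app').getOrCreate()

data = [('kim', 100), ('lee', None)]            # None where the schema forbids it
schema = StructType([
    StructField('name', StringType(), True),
    StructField('score', IntegerType(), False)  # nullable=False
])

# Raises a ValueError along the lines of
# "field score: This field is not nullable, but got None"
df = spark.createDataFrame(data = data, schema = schema)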
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [
    ('kim', 100),
    ('kim', 90),
    ('lee', 80),
    ('lee', 70),
    ('park', 60)
]
schema = ['name', 'score']

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show()
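With the schema given only as a list of column names, Spark infers the column types from the data (Python str becomes string, int becomes long), so df.printSchema() would typically print:

root
 |-- name: string (nullable = true)
 |-- score: long (nullable = true)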
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [
    ('kim', 100),
    ('kim', 90),
    ('lee', 80),
    ('lee', 70),
    ('park', 60)
]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('score', IntegerType(), True)..
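The preview is cut off at the end of the schema definition. Presumably it closes the StructType and builds the DataFrame the same way as the previous example; a self-contained completion sketch (everything after the cut is assumed):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master('local').appName('my_pyspark_app').getOrCreate()

data = [('kim', 100), ('kim', 90), ('lee', 80), ('lee', 70), ('park', 60)]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('score', IntegerType(), True)
])

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()   # score is now integer (from IntegerType) instead of the inferred long
df.show()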
Data Engineering/Spark
2023.01.14
+----------------------------------------------------------+--------------+------------------+
| Code                                                     | 100,000 rows | 100,000,000 rows |
+----------------------------------------------------------+--------------+------------------+
| df.select("col_name").rdd.flatMap(lambda x: x).collect() |          0.4 |             55.3 |
| list(df.select('col_name').toPandas()['col_name'])       |          0.4 |             17.5 |
| df.select('col_name').rdd.map(lambda row : ro..
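The last row of the table is cut off; from the visible prefix it is presumably the variant that maps each Row to its first field. For reference, a sketch of the three approaches being compared, run here against a tiny example DataFrame:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('my_pyspark_app').getOrCreate()
df = spark.createDataFrame([('kim', 100), ('lee', 80)], ['col_name', 'score'])

# 1. flatMap over the underlying RDD
values = df.select('col_name').rdd.flatMap(lambda x: x).collect()

# 2. go through pandas
values = list(df.select('col_name').toPandas()['col_name'])

# 3. map each Row to its first field (assumed completion of the truncated row)
values = df.select('col_name').rdd.map(lambda row: row[0]).collect()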
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [
    ('kim', 100),
    ('kim', 90),
    ('lee', 80),
    ('lee', 70),
    ('park', 60)
]
schema = StructType([
    StructField('name', StringType(), True),
    Str..
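The preview stops inside the schema definition, and the extra import of col suggests the post goes on to select or transform columns with col(). A minimal sketch along those lines (everything after the cut is assumed):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master('local').appName('my_pyspark_app').getOrCreate()

data = [('kim', 100), ('kim', 90), ('lee', 80), ('lee', 70), ('park', 60)]
df = spark.createDataFrame(data, ['name', 'score'])

# select by name, by Column object, and with a derived expression
df.select('name', 'score').show()
df.select(col('name'), col('score')).show()
df.select(col('name'), (col('score') + 10).alias('score_plus_10')).show()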
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [
    ('kim', 100),
    ('kim', 90),
    ('lee', 80),
    ('lee', 70),
    ('park', 60)
]
schema = StructType([
    StructField('name', StringType(), True),
    Str..
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [
    ('kim', 'a', 100),
    ('kim', 'a', 90),
    ('lee', 'a', 80),
    ('lee', 'b', 70),
    ('park', 'b', 60)
]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('cla..
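The schema is cut off in the middle of the second field, which from the data looks like a string 'class' column. A completion sketch, with a groupBy added only as a guess at what the post does with that column:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import avg

spark = SparkSession.builder.master('local').appName('my_pyspark_app').getOrCreate()

data = [('kim', 'a', 100), ('kim', 'a', 90), ('lee', 'a', 80), ('lee', 'b', 70), ('park', 'b', 60)]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('class', StringType(), True),   # assumed: the field the preview cuts off at
    StructField('score', IntegerType(), True)
])

df = spark.createDataFrame(data, schema)
df.show()

# purely illustrative: average score per class
df.groupBy('class').agg(avg('score').alias('avg_score')).show()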
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [
    ("kim", 100),
    ("kim", 90),
    ("lee", 80),
    ("lee", 70),
    ('park', 60)
]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('score', IntegerType(), True)
]..
Data Engineering/Spark
2023.01.14
from pyspark.sql import SparkSession, Row

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

data = [Row(id = 0, name = 'park', score = 100),
        Row(id = 1, name = 'lee', score = 90),
        Row(id = 2, name = 'kim', score = 80)]

df = spark.createDataFrame(data)
df.show()
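With Row objects, the column names come from the Row fields and the types are inferred, so df.show() would typically print something like:

+---+----+-----+
| id|name|score|
+---+----+-----+
|  0|park|  100|
|  1| lee|   90|
|  2| kim|   80|
+---+----+-----+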
Data Engineering/Spark
2023.01.14
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master('local') \
    .appName('my_pyspark_app') \
    .getOrCreate()

df_pandas = pd.DataFrame({
    'id': [0, 1, 2, 3, 4],
    'name': ['kim', 'kim', 'park', 'park', 'lee'],
    'score': [100, 90, 80, 70, 60]
})

df_spark = spark.createDataFrame(df_pandas)

print(df_pandas)
df_spark.show()
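The Spark schema is inferred from the pandas dtypes (int64 becomes long, object becomes string), so df_spark.printSchema() would typically show:

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- score: long (nullable = true)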