hadoop - Connect spark session on Jupyter - Stack Overflow


docker-compose.yml

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    environment:
      - CLUSTER_NAME=hadoop_cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9870"]
      interval: 30s
      timeout: 10s
      retries: 3

  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    restart: always
    ports:
      - 9864:9864
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    depends_on:
      - namenode

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    ports:
      - 8088:8088
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    depends_on:
      - namenode

  nodemanager:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    ports:
      - 8042:8042
    environment:
      - CORE_CONF_fs_defaultFS=hdfs://namenode:9000
      - YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
      - YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
    depends_on:
      - resourcemanager

  spark-master:
    image: bitnami/spark:3.5
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - "8080:8080"
      - "7077:7077"
    volumes:
      - spark_data:/bitnami/spark

  spark-worker:
    image: bitnami/spark:3.5
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    # ports:
    #   - "8081-8084:8081"
    volumes:
      - spark_data:/bitnami/spark
    depends_on:
      - spark-master
    deploy:
      replicas: 2

  jupyter:
    image: jupyter/pyspark-notebook:latest
    container_name: jupyter
    ports:
      - "8888:8888"
    environment:
      - JUPYTER_ENABLE_LAB=yes
      - SPARK_MASTER=spark://spark-master:7077
    volumes:
      - jupyter_data:/home/jovyan/work
      - ./notebooks:/home/jovyan/work/notebooks
    depends_on:
      - spark-master

volumes:
  hadoop_namenode:
  hadoop_datanode:
  spark_data:
  jupyter_data:

Everything works fine when I create a Spark DataFrame, write it to HDFS, and read it back from HDFS via spark-shell.

First of all, I create a test directory in HDFS and set its permissions to 777:

hdfs commands

hdfs dfs -mkdir -p /test/employees
hdfs dfs -chmod -R 777 /test

spark-shell

// First, create a dummy dataframe
val data = Seq(
  (1, "John", 30, "New York"),
  (2, "Alice", 25, "San Francisco"),
  (3, "Bob", 35, "Chicago"),
  (4, "Carol", 28, "Boston"),
  (5, "David", 40, "Seattle")
)

// Define the schema
val columns = Seq("id", "name", "age", "city")

// Create the DataFrame
val df = spark.createDataFrame(data).toDF(columns: _*)

// Show the DataFrame
df.show()

// Write to HDFS
df.write
  .mode("overwrite")
  .parquet("hdfs://namenode:9000/test/employees")

// Read back from HDFS
val dfRead = spark.read
  .parquet("hdfs://namenode:9000/test/employees")

// Show the read DataFrame
println("\nData read back from HDFS:")
dfRead.show()

// Perform some basic analysis
println("\nBasic statistics:")
dfRead.describe("age").show()

println("\nCount by city:")
dfRead.groupBy("city").count().show()

But when I try to do the same thing in Jupyter, it cannot create the Spark DataFrame, even though the Spark session connects successfully.

Install the pyspark package: pip install pyspark

jupyter notebook

from pyspark.sql import SparkSession

# Create Spark session with proper configuration
spark = SparkSession.builder \
    .appName("JupyterTest") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()

# Create test DataFrame
data = [("John", 30), ("Alice", 25), ("Bob", 35)]
df = spark.createDataFrame(data, ["name", "age"])

# Show DataFrame
print("Original DataFrame:")
df.show()

# Write to HDFS
df.write.mode("overwrite").parquet("hdfs://namenode:9000/test/people")

# Read back from HDFS
df_read = spark.read.parquet("hdfs://namenode:9000/test/people")
print("\nData read from HDFS:")
df_read.show()

Is something missing in my Spark session configuration?
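
For reference, here is a sketch of the same session builder with the extra driver-networking settings that are often recommended when the driver runs inside a container, so the executors can call back to it. The bind address and the fixed port numbers below are assumptions about this compose setup, not something I have verified:

from pyspark.sql import SparkSession

# Sketch only: pin the driver's address and ports so the spark-worker containers
# can reach the driver running inside the "jupyter" container.
# "0.0.0.0" and the ports 7078/7079 are arbitrary assumptions; whatever values are
# chosen must be reachable from the workers on the compose network.
spark = SparkSession.builder \
    .appName("JupyterTest") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.driver.bindAddress", "0.0.0.0") \
    .config("spark.driver.port", "7078") \
    .config("spark.blockManager.port", "7079") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .getOrCreate()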
