Introduction to Spark

Apache Spark is a distributed engine for large-scale data processing. In PySpark, the SparkSession is the unified entry point to Spark functionality; the snippet below creates one and counts the elements of a small parallelized collection.

from pyspark.sql import SparkSession

# Create a SparkSession, running locally on all available cores
spark = SparkSession.builder \
    .appName("Spark Intro") \
    .master("local[*]") \
    .getOrCreate()

# Perform basic operations
data = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(data)
count = rdd.count()

print("Count: ", count)

Spark Architecture

A Spark application runs as a driver program that schedules tasks on a set of executors. The driver holds a SparkContext (or SparkSession), which connects the application to the cluster manager and distributes data and computation across the executors. The example below creates a SparkContext directly, the lower-level entry point that predates SparkSession, and uses it to square the elements of an RDD.

from pyspark import SparkContext

# Create a SparkContext, running locally on all available cores
sc = SparkContext(master="local[*]", appName="RDD Example")

# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Perform operations on RDD
squared_rdd = rdd.map(lambda x: x ** 2)
result = squared_rdd.collect()

print("Squared RDD: ", result)

RDD (Resilient Distributed Datasets)

An RDD is Spark's original data abstraction: an immutable, fault-tolerant collection of elements partitioned across the cluster and processed in parallel. Transformations such as map and filter build up a lineage lazily, while actions such as count, collect, and reduce trigger the actual computation.
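
As a short sketch of that laziness, assuming the SparkContext sc from the previous section is still live:

# Transformations are lazy: nothing executes here
evens = sc.parallelize(range(10)).filter(lambda x: x % 2 == 0)
doubled = evens.map(lambda x: x * 2)

# reduce() is an action: it triggers the distributed computation
total = doubled.reduce(lambda a, b: a + b)
print("Sum of doubled evens:", total)  # 40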