PySpark RDD examples
from pyspark.sql import SparkSession

# Build (or reuse) a local session. "local[1]" runs Spark in-process with a
# single worker thread, which is enough for these small examples.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('RDD_Methods_Examples')
    .getOrCreate()
)

# parallelize() is a SparkContext method, not a SparkSession method, so the
# RDD examples below must go through the session's underlying context.
sc = spark.sparkContext
print(sc.version)
# Output: 3.2.1

# --- map / filter / reduce example ---
rddNum = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
rddNum = rddNum.map(lambda x: x + 10)          # 11 .. 20
rddNum = rddNum.filter(lambda x: x % 2 == 0)   # keep the even values
print(rddNum.reduce(lambda a, b: a + b))       # sum of what is left
# Output: 80

# --- filter on strings example ---
nameRdd = sc.parallelize(
    ['Deepak', 'Simmi', 'Simran', 'Sukhwinder', 'Sanki', 'ShotTemper']
)

# Names that start with 'S'.
rddNum = nameRdd.filter(lambda name: name.startswith('S'))
print(rddNum.collect())
# Output: ['Simmi', 'Simran', 'Sukhwinder', 'Sanki', 'ShotTemper']

# Names that do not start with 'S'.
rddNum = nameRdd.filter(lambda name: not name.startswith('S'))
print(rddNum.collect())
# Output: ['Deepak']

# union example
rddNum = sc.parallelize(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 30, 21, 45, 23, 22, 77, 44]
)
divisibleByTwo = rddNum.filter(lambda x: x % 2 == 0)
divisibleByThree = rddNum.filter(lambda x: x % 3 == 0)
# NOTE(review): the source is cut off here, mid-statement, at
# "print(divisibleByTw..."; the remainder of the union example is not
# visible and has deliberately not been reconstructed.