import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]") \
    .appName('SplitFile') \
    .getOrCreate()

# read a pipe-delimited CSV with a header row
readDF = spark.read.format("csv") \
    .option("header", True) \
    .option("delimiter", "|") \
    .load(r"C:\Users\ankus\PycharmProjects\pythonProject2\venv\resources\empdata.csv")

print(readDF.count())
print(readDF.rdd.getNumPartitions())

# repartition(10) triggers a full shuffle into 10 partitions
readDF = readDF.repartition(10)
print(readDF.rdd.getNumPartitions())
readDF.printSchema()

# hash-partition the rows by gender into 4 partitions, then write them as CSV
readDF.repartition(4, 'gender').write.csv(
    r'C:\Users\ankus\PycharmProjects\pythonProject2\venv\Output', 'overwrite')
# readDF.coalesce(10).write.save(r'C:\Users\ankus\PycharmProjects\pythonProject2\venv\Output', 'csv', 'overwrite', None)
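Note that repartition(4, 'gender') only hash-partitions rows in memory; all part files still land in one directory. If the goal is one output folder per gender value, DataFrameWriter.partitionBy is the usual tool. A minimal sketch, reusing readDF from above; the 'OutputByGender' path is hypothetical:

# write one subdirectory per distinct gender value, e.g. ...\OutputByGender\gender=M\
readDF.write \
    .option('header', True) \
    .mode('overwrite') \
    .partitionBy('gender') \
    .csv(r'C:\Users\ankus\PycharmProjects\pythonProject2\venv\OutputByGender')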
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]") \
    .appName('SplitFile') \
    .getOrCreate()

# distribute the numbers 0-19 across 6 partitions
rdd = spark.sparkContext.parallelize(range(0, 20), 6)
print('no of partitions is', rdd.getNumPartitions())

# write one part file per partition
rdd.saveAsTextFile(r'C:\Users\ankus\PycharmProjects\pythonProject2\venv\Output2')
rdd1.saveAsTextFile("/tmp/partition") //Writes 6 part files, one for each partition Partition 1 : 0 1 2 Partition 2 : 3 4 5 Partition 3 : 6 7 8 9 Partition 4 : 10 11 12 Partition 5 : 13 14 15 Partition 6 : 16 17 18 19
# coalesce(4) merges the 6 existing partitions down to 4 without a full shuffle
rdd3 = rdd.coalesce(4)
print("Coalesce size :", rdd3.getNumPartitions())
rdd3.saveAsTextFile("/tmp/coalesce")

Partition 1 : 0 1 2
Partition 2 : 3 4 5 6 7 8 9
Partition 4 : 10 11 12
Partition 5 : 13 14 15 16 17 18 19
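The missing Partition 3 above is the tell-tale of coalesce: whole partitions are merged in place rather than reshuffled, so element order inside each surviving partition is preserved. A small sketch to compare both behaviours side by side, using the same rdd:

# coalesce(4): merges neighbouring partitions, no full shuffle, ordering preserved
print(rdd.coalesce(4).glom().collect())
# repartition(4): full shuffle, elements redistributed across all 4 partitions
print(rdd.repartition(4).glom().collect())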
# assuming df holds the same 0-19 values as a DataFrame, e.g. df = spark.range(0, 20)
df2 = df.repartition(6)
print(df2.rdd.getNumPartitions())

Partition 1 : 14 1 5
Partition 2 : 4 16 15
Partition 3 : 8 3 18
Partition 4 : 12 2 19
Partition 5 : 6 17 7 0
Partition 6 : 9 10 11 13
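The scrambled values above come from the full shuffle that DataFrame repartition always performs; the shuffle is visible in the physical plan as an Exchange node. A minimal check, assuming the same df:

df2 = df.repartition(6)
df2.explain()  # the plan shows Exchange RoundRobinPartitioning(6), i.e. a full shuffle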