 
			 
			





 
			 
															
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when
def getsparkSession():
    """Return a SparkSession configured for this example.

    The builder is pointed at the "yarn" master and given the application
    name 'Learnomate Example'; the session itself comes from
    ``getOrCreate()``.
    """
    return (
        SparkSession.builder
        .master("yarn")
        .appName('Learnomate Example')
        .getOrCreate()
    )
# Obtain the Spark session used by every read in this script.
spark = getsparkSession()

# Pipe-delimited CSV with a header row, loaded from the local resources folder.
origin_df = (
    spark.read.format('csv')
    .option('header', 'True')
    .option('delimiter', '|')
    .load(r"C:\Users\ankus\PycharmProjects\pythonProject2\venv\resources\empdata.csv")
)
origin_df.show()

# Parquet file from the same resources folder.
df = (
    spark.read.format('parquet')
    .load(r"C:\Users\ankus\PycharmProjects\pythonProject2\venv\resources\Train.parquet")
)
df.show()

# Avro file; note that `df` is rebound here, replacing the Parquet DataFrame.
# NOTE(review): the 'avro' source normally requires the external spark-avro
# package to be available to the session -- confirm for this deployment.
df = (
    spark.read.format('avro')
    .load(r"C:\Users\ankus\PycharmProjects\pythonProject2\venv\resources\variants.avro")
)
df.show()
# Read data from HDFS.
# (This heading was previously a bare line of text, which is a SyntaxError
# and prevented the whole script from being compiled.)
# Same pipe-delimited CSV layout with a header row, but loaded through an
# hdfs:// URI; `origin_df` is rebound to the result.
origin_df = (
    spark.read.format('csv')
    .option('header', 'True')
    .option('delimiter', '|')
    .load("hdfs://sandbox-hdp.hortonworks.com:8020/input/empdata.csv")
)
origin_df.show()