Strings and Dates
In [5]:
%%sh
pip install pyspark
Requirement already satisfied: pyspark in /home/ubuntu/yes/lib/python3.12/site-packages (3.5.4)
Requirement already satisfied: py4j==0.10.9.7 in /home/ubuntu/yes/lib/python3.12/site-packages (from pyspark) (0.10.9.7)
In [1]:
# Spark Session
from pyspark.sql import SparkSession
spark = (
    SparkSession
    .builder
    .appName("Working with Strings & Dates")
    .master("local[*]")
    .getOrCreate()
)
spark
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/15 09:48:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/15 09:48:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Out[1]:
SparkSession - in-memory
In [2]:
# Emp Data & Schema
emp_data = [
["001","101","John Doe","30","Male","50000","2015-01-01"],
["002","101","Jane Smith","25","Female","45000","2016-02-15"],
["003","102","Bob Brown","35","Male","55000","2014-05-01"],
["004","102","Alice Lee","28","Female","48000","2017-09-30"],
["005","103","Jack Chan","40","Male","60000","2013-04-01"],
["006","103","Jill Wong","32","Female","52000","2018-07-01"],
["007","101","James Johnson","42","Male","70000","2012-03-15"],
["008","102","Kate Kim","29","Female","51000","2019-10-01"],
["009","103","Tom Tan","33","Male","58000","2016-06-01"],
["010","104","Lisa Lee","27","Female","47000","2018-08-01"],
["011","104","David Park","38","Male","65000","2015-11-01"],
["012","105","Susan Chen","31","Female","54000","2017-02-15"],
["013","106","Brian Kim","45","Male","75000","2011-07-01"],
["014","107","Emily Lee","26","Female","46000","2019-01-01"],
["015","106","Michael Lee","37","Male","63000","2014-09-30"],
["016","107","Kelly Zhang","30","Female","49000","2018-04-01"],
["017","105","George Wang","34","Male","57000","2016-03-15"],
["018","104","Nancy Liu","29","","50000","2017-06-01"],
["019","103","Steven Chen","36","Male","62000","2015-08-01"],
["020","102","Grace Kim","32","Female","53000","2018-11-01"]
]
emp_schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"
In [6]:
# Create emp DataFrame
emp = spark.createDataFrame(data=emp_data, schema=emp_schema)
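Every column in emp_schema is declared as a string. As an optional aside (not part of the original notebook), the numeric columns could be cast to proper types before the string and date fixes that follow; emp_typed is an illustrative name used only for this sketch.

# Optional sketch: cast numeric string columns to typed columns
from pyspark.sql.functions import col

emp_typed = (
    emp
    .withColumn("age", col("age").cast("int"))
    .withColumn("salary", col("salary").cast("double"))
)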
In [7]:
# Normalize gender to single-letter codes (blank values become null)
from pyspark.sql.functions import when, col, expr
emp_gender_fixed = emp.withColumn("gender",expr("case when gender = 'Male' then 'M' when gender = 'Female' then 'F' else null end"))
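The when function imported above is not used in this cell. As an equivalent sketch, the same CASE logic can be written with the column API; emp_gender_alt is an illustrative name.

# Sketch: same gender mapping using when/otherwise instead of a SQL expression
from pyspark.sql.functions import when, col

emp_gender_alt = emp.withColumn(
    "gender",
    when(col("gender") == "Male", "M")
    .when(col("gender") == "Female", "F")
    .otherwise(None)
)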
In [8]:
emp_gender_fixed.show()
+-----------+-------------+-------------+---+------+------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|
+-----------+-------------+-------------+---+------+------+----------+
|        001|          101|     John Doe| 30|     M| 50000|2015-01-01|
|        002|          101|   Jane Smith| 25|     F| 45000|2016-02-15|
|        003|          102|    Bob Brown| 35|     M| 55000|2014-05-01|
|        004|          102|    Alice Lee| 28|     F| 48000|2017-09-30|
|        005|          103|    Jack Chan| 40|     M| 60000|2013-04-01|
|        006|          103|    Jill Wong| 32|     F| 52000|2018-07-01|
|        007|          101|James Johnson| 42|     M| 70000|2012-03-15|
|        008|          102|     Kate Kim| 29|     F| 51000|2019-10-01|
|        009|          103|      Tom Tan| 33|     M| 58000|2016-06-01|
|        010|          104|     Lisa Lee| 27|     F| 47000|2018-08-01|
|        011|          104|   David Park| 38|     M| 65000|2015-11-01|
|        012|          105|   Susan Chen| 31|     F| 54000|2017-02-15|
|        013|          106|    Brian Kim| 45|     M| 75000|2011-07-01|
|        014|          107|    Emily Lee| 26|     F| 46000|2019-01-01|
|        015|          106|  Michael Lee| 37|     M| 63000|2014-09-30|
|        016|          107|  Kelly Zhang| 30|     F| 49000|2018-04-01|
|        017|          105|  George Wang| 34|     M| 57000|2016-03-15|
|        018|          104|    Nancy Liu| 29|  NULL| 50000|2017-06-01|
|        019|          103|  Steven Chen| 36|     M| 62000|2015-08-01|
|        020|          102|    Grace Kim| 32|     F| 53000|2018-11-01|
+-----------+-------------+-------------+---+------+------+----------+
In [10]:
# Replace characters in a string column
# select *, regexp_replace(name, 'J', 'Z') as new_name from emp_gender_fixed
from pyspark.sql.functions import regexp_replace
emp_name_fixed = emp_gender_fixed.withColumn("new_name", regexp_replace(col("name"), "J", "Z"))
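regexp_replace is only one of the built-in string functions. A short optional sketch of a few others applied to the same name column; the new column names are illustrative.

# Optional sketch: other common string functions
from pyspark.sql.functions import upper, initcap, length, split

emp_string_demo = (
    emp_name_fixed
    .withColumn("name_upper", upper(col("name")))
    .withColumn("name_initcap", initcap(col("name")))
    .withColumn("name_length", length(col("name")))
    .withColumn("first_name", split(col("name"), " ").getItem(0))
)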
In [11]:
# Convert Date
# select *, to_date(hire_date, 'yyyy-MM-dd') as hire_date from emp_name_fixed
from pyspark.sql.functions import to_date
emp_date_fix = emp_name_fixed.withColumn("hire_date", to_date(col("hire_date"), 'yyyy-MM-dd'))
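to_date keeps only the date part. If the source column also carried a time component, to_timestamp would be the function to use; this is a hedged sketch on the same column (here it simply yields midnight timestamps), with an illustrative column name.

# Sketch: parse the same string into a timestamp instead of a date
from pyspark.sql.functions import to_timestamp

ts_demo = emp_name_fixed.withColumn("hire_ts", to_timestamp(col("hire_date"), "yyyy-MM-dd"))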
In [12]:
emp_date_fix.printSchema()
root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- new_name: string (nullable = true)
In [13]:
# Add Date Columns
# Add current_date and current_timestamp columns (year extraction from hire_date is sketched below)
from pyspark.sql.functions import current_date, current_timestamp
emp_dated = emp_date_fix.withColumn("date_now", current_date()).withColumn("timestamp_now", current_timestamp())
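The cell comment mentions extracting the year from hire_date, but the code above only adds the current date and timestamp. A sketch of that extraction; the new column names are illustrative.

# Sketch: extract date parts from hire_date
from pyspark.sql.functions import year, month, dayofmonth

emp_parts = (
    emp_dated
    .withColumn("hire_year", year(col("hire_date")))
    .withColumn("hire_month", month(col("hire_date")))
    .withColumn("hire_day", dayofmonth(col("hire_date")))
)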
In [14]:
emp_dated.show(truncate=False)
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+-----------------------+
|employee_id|department_id|name         |age|gender|salary|hire_date |new_name     |date_now  |timestamp_now          |
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+-----------------------+
|001        |101          |John Doe     |30 |M     |50000 |2015-01-01|Zohn Doe     |2025-02-15|2025-02-15 09:50:17.729|
|002        |101          |Jane Smith   |25 |F     |45000 |2016-02-15|Zane Smith   |2025-02-15|2025-02-15 09:50:17.729|
|003        |102          |Bob Brown    |35 |M     |55000 |2014-05-01|Bob Brown    |2025-02-15|2025-02-15 09:50:17.729|
|004        |102          |Alice Lee    |28 |F     |48000 |2017-09-30|Alice Lee    |2025-02-15|2025-02-15 09:50:17.729|
|005        |103          |Jack Chan    |40 |M     |60000 |2013-04-01|Zack Chan    |2025-02-15|2025-02-15 09:50:17.729|
|006        |103          |Jill Wong    |32 |F     |52000 |2018-07-01|Zill Wong    |2025-02-15|2025-02-15 09:50:17.729|
|007        |101          |James Johnson|42 |M     |70000 |2012-03-15|Zames Zohnson|2025-02-15|2025-02-15 09:50:17.729|
|008        |102          |Kate Kim     |29 |F     |51000 |2019-10-01|Kate Kim     |2025-02-15|2025-02-15 09:50:17.729|
|009        |103          |Tom Tan      |33 |M     |58000 |2016-06-01|Tom Tan      |2025-02-15|2025-02-15 09:50:17.729|
|010        |104          |Lisa Lee     |27 |F     |47000 |2018-08-01|Lisa Lee     |2025-02-15|2025-02-15 09:50:17.729|
|011        |104          |David Park   |38 |M     |65000 |2015-11-01|David Park   |2025-02-15|2025-02-15 09:50:17.729|
|012        |105          |Susan Chen   |31 |F     |54000 |2017-02-15|Susan Chen   |2025-02-15|2025-02-15 09:50:17.729|
|013        |106          |Brian Kim    |45 |M     |75000 |2011-07-01|Brian Kim    |2025-02-15|2025-02-15 09:50:17.729|
|014        |107          |Emily Lee    |26 |F     |46000 |2019-01-01|Emily Lee    |2025-02-15|2025-02-15 09:50:17.729|
|015        |106          |Michael Lee  |37 |M     |63000 |2014-09-30|Michael Lee  |2025-02-15|2025-02-15 09:50:17.729|
|016        |107          |Kelly Zhang  |30 |F     |49000 |2018-04-01|Kelly Zhang  |2025-02-15|2025-02-15 09:50:17.729|
|017        |105          |George Wang  |34 |M     |57000 |2016-03-15|George Wang  |2025-02-15|2025-02-15 09:50:17.729|
|018        |104          |Nancy Liu    |29 |NULL  |50000 |2017-06-01|Nancy Liu    |2025-02-15|2025-02-15 09:50:17.729|
|019        |103          |Steven Chen  |36 |M     |62000 |2015-08-01|Steven Chen  |2025-02-15|2025-02-15 09:50:17.729|
|020        |102          |Grace Kim    |32 |F     |53000 |2018-11-01|Grace Kim    |2025-02-15|2025-02-15 09:50:17.729|
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+-----------------------+
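With both hire_date and date_now available, date arithmetic such as employee tenure is straightforward. A hedged sketch; days_employed and years_employed are illustrative names.

# Sketch: tenure from hire_date to the current date
from pyspark.sql.functions import datediff, months_between, round as spark_round

emp_tenure = (
    emp_dated
    .withColumn("days_employed", datediff(col("date_now"), col("hire_date")))
    .withColumn("years_employed", spark_round(months_between(col("date_now"), col("hire_date")) / 12, 1))
)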
In [15]:
# Drop records that contain null values (only gender can be null here)
emp_1 = emp_dated.na.drop()
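na.drop() with no arguments removes rows that have a null in any column; here only gender can be null, so the result is the same. A sketch of the more targeted variants, with illustrative names.

# Sketch: drop only on specific columns, or fill nulls instead of dropping rows
emp_drop_gender = emp_dated.na.drop(subset=["gender"])
emp_fill_gender = emp_dated.na.fill({"gender": "O"})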
In [16]:
emp_1.show()
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+--------------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|     new_name|  date_now|       timestamp_now|
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+--------------------+
|        001|          101|     John Doe| 30|     M| 50000|2015-01-01|     Zohn Doe|2025-02-15|2025-02-15 09:51:...|
|        002|          101|   Jane Smith| 25|     F| 45000|2016-02-15|   Zane Smith|2025-02-15|2025-02-15 09:51:...|
|        003|          102|    Bob Brown| 35|     M| 55000|2014-05-01|    Bob Brown|2025-02-15|2025-02-15 09:51:...|
|        004|          102|    Alice Lee| 28|     F| 48000|2017-09-30|    Alice Lee|2025-02-15|2025-02-15 09:51:...|
|        005|          103|    Jack Chan| 40|     M| 60000|2013-04-01|    Zack Chan|2025-02-15|2025-02-15 09:51:...|
|        006|          103|    Jill Wong| 32|     F| 52000|2018-07-01|    Zill Wong|2025-02-15|2025-02-15 09:51:...|
|        007|          101|James Johnson| 42|     M| 70000|2012-03-15|Zames Zohnson|2025-02-15|2025-02-15 09:51:...|
|        008|          102|     Kate Kim| 29|     F| 51000|2019-10-01|     Kate Kim|2025-02-15|2025-02-15 09:51:...|
|        009|          103|      Tom Tan| 33|     M| 58000|2016-06-01|      Tom Tan|2025-02-15|2025-02-15 09:51:...|
|        010|          104|     Lisa Lee| 27|     F| 47000|2018-08-01|     Lisa Lee|2025-02-15|2025-02-15 09:51:...|
|        011|          104|   David Park| 38|     M| 65000|2015-11-01|   David Park|2025-02-15|2025-02-15 09:51:...|
|        012|          105|   Susan Chen| 31|     F| 54000|2017-02-15|   Susan Chen|2025-02-15|2025-02-15 09:51:...|
|        013|          106|    Brian Kim| 45|     M| 75000|2011-07-01|    Brian Kim|2025-02-15|2025-02-15 09:51:...|
|        014|          107|    Emily Lee| 26|     F| 46000|2019-01-01|    Emily Lee|2025-02-15|2025-02-15 09:51:...|
|        015|          106|  Michael Lee| 37|     M| 63000|2014-09-30|  Michael Lee|2025-02-15|2025-02-15 09:51:...|
|        016|          107|  Kelly Zhang| 30|     F| 49000|2018-04-01|  Kelly Zhang|2025-02-15|2025-02-15 09:51:...|
|        017|          105|  George Wang| 34|     M| 57000|2016-03-15|  George Wang|2025-02-15|2025-02-15 09:51:...|
|        019|          103|  Steven Chen| 36|     M| 62000|2015-08-01|  Steven Chen|2025-02-15|2025-02-15 09:51:...|
|        020|          102|    Grace Kim| 32|     F| 53000|2018-11-01|    Grace Kim|2025-02-15|2025-02-15 09:51:...|
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+--------------------+
In [19]:
# Fix Null values
# select *, nvl(gender, 'O') as new_gender from emp_1
from pyspark.sql.functions import coalesce, lit
emp_null_df = emp_1.withColumn("new_gender", coalesce(col("gender"), lit("O")))
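The nvl shown in the SQL comment can also be run directly through expr; a one-line alternative sketch with an illustrative name.

# Sketch: same null handling via a SQL expression
from pyspark.sql.functions import expr

emp_null_sql = emp_1.withColumn("new_gender", expr("nvl(gender, 'O')"))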
In [20]:
emp_null_df.show()
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+--------------------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|     new_name|  date_now|       timestamp_now|new_gender|
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+--------------------+----------+
|        001|          101|     John Doe| 30|     M| 50000|2015-01-01|     Zohn Doe|2025-02-15|2025-02-15 09:52:...|         M|
|        002|          101|   Jane Smith| 25|     F| 45000|2016-02-15|   Zane Smith|2025-02-15|2025-02-15 09:52:...|         F|
|        003|          102|    Bob Brown| 35|     M| 55000|2014-05-01|    Bob Brown|2025-02-15|2025-02-15 09:52:...|         M|
|        004|          102|    Alice Lee| 28|     F| 48000|2017-09-30|    Alice Lee|2025-02-15|2025-02-15 09:52:...|         F|
|        005|          103|    Jack Chan| 40|     M| 60000|2013-04-01|    Zack Chan|2025-02-15|2025-02-15 09:52:...|         M|
|        006|          103|    Jill Wong| 32|     F| 52000|2018-07-01|    Zill Wong|2025-02-15|2025-02-15 09:52:...|         F|
|        007|          101|James Johnson| 42|     M| 70000|2012-03-15|Zames Zohnson|2025-02-15|2025-02-15 09:52:...|         M|
|        008|          102|     Kate Kim| 29|     F| 51000|2019-10-01|     Kate Kim|2025-02-15|2025-02-15 09:52:...|         F|
|        009|          103|      Tom Tan| 33|     M| 58000|2016-06-01|      Tom Tan|2025-02-15|2025-02-15 09:52:...|         M|
|        010|          104|     Lisa Lee| 27|     F| 47000|2018-08-01|     Lisa Lee|2025-02-15|2025-02-15 09:52:...|         F|
|        011|          104|   David Park| 38|     M| 65000|2015-11-01|   David Park|2025-02-15|2025-02-15 09:52:...|         M|
|        012|          105|   Susan Chen| 31|     F| 54000|2017-02-15|   Susan Chen|2025-02-15|2025-02-15 09:52:...|         F|
|        013|          106|    Brian Kim| 45|     M| 75000|2011-07-01|    Brian Kim|2025-02-15|2025-02-15 09:52:...|         M|
|        014|          107|    Emily Lee| 26|     F| 46000|2019-01-01|    Emily Lee|2025-02-15|2025-02-15 09:52:...|         F|
|        015|          106|  Michael Lee| 37|     M| 63000|2014-09-30|  Michael Lee|2025-02-15|2025-02-15 09:52:...|         M|
|        016|          107|  Kelly Zhang| 30|     F| 49000|2018-04-01|  Kelly Zhang|2025-02-15|2025-02-15 09:52:...|         F|
|        017|          105|  George Wang| 34|     M| 57000|2016-03-15|  George Wang|2025-02-15|2025-02-15 09:52:...|         M|
|        019|          103|  Steven Chen| 36|     M| 62000|2015-08-01|  Steven Chen|2025-02-15|2025-02-15 09:52:...|         M|
|        020|          102|    Grace Kim| 32|     F| 53000|2018-11-01|    Grace Kim|2025-02-15|2025-02-15 09:52:...|         F|
+-----------+-------------+-------------+---+------+------+----------+-------------+----------+--------------------+----------+
In [21]:
# Drop the old columns and rename the new ones
emp_final = emp_null_df.drop("name", "gender").withColumnRenamed("new_name", "name").withColumnRenamed("new_gender", "gender")
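Since pyspark 3.5.4 is installed here, the two renames can also be done in one call with withColumnsRenamed (available from Spark 3.4); a one-line alternative sketch with an illustrative name.

# Sketch: batch rename available in Spark 3.4+
emp_final_alt = emp_null_df.drop("name", "gender").withColumnsRenamed({"new_name": "name", "new_gender": "gender"})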
In [23]:
# Convert the date back to a string in dd/MM/yyyy format
# select *, date_format(hire_date, 'dd/MM/yyyy') as date_string from emp_final
from pyspark.sql.functions import date_format
emp_fixed = emp_final.withColumn("date_string",date_format(col("hire_date"),"dd/MM/yyyy"))
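date_format accepts any Spark datetime pattern, so the same column can be rendered in other layouts; a short sketch with illustrative column names.

# Sketch: other date_format patterns
from pyspark.sql.functions import date_format

emp_format_demo = (
    emp_final
    .withColumn("hire_month_name", date_format(col("hire_date"), "MMMM yyyy"))
    .withColumn("hire_weekday", date_format(col("hire_date"), "EEEE"))
)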
In [25]:
emp_fixed.show()
+-----------+-------------+---+------+----------+-------------+----------+--------------------+------+-----------+
|employee_id|department_id|age|salary| hire_date|         name|  date_now|       timestamp_now|gender|date_string|
+-----------+-------------+---+------+----------+-------------+----------+--------------------+------+-----------+
|        001|          101| 30| 50000|2015-01-01|     Zohn Doe|2025-02-15|2025-02-15 09:55:...|     M| 01/01/2015|
|        002|          101| 25| 45000|2016-02-15|   Zane Smith|2025-02-15|2025-02-15 09:55:...|     F| 15/02/2016|
|        003|          102| 35| 55000|2014-05-01|    Bob Brown|2025-02-15|2025-02-15 09:55:...|     M| 01/05/2014|
|        004|          102| 28| 48000|2017-09-30|    Alice Lee|2025-02-15|2025-02-15 09:55:...|     F| 30/09/2017|
|        005|          103| 40| 60000|2013-04-01|    Zack Chan|2025-02-15|2025-02-15 09:55:...|     M| 01/04/2013|
|        006|          103| 32| 52000|2018-07-01|    Zill Wong|2025-02-15|2025-02-15 09:55:...|     F| 01/07/2018|
|        007|          101| 42| 70000|2012-03-15|Zames Zohnson|2025-02-15|2025-02-15 09:55:...|     M| 15/03/2012|
|        008|          102| 29| 51000|2019-10-01|     Kate Kim|2025-02-15|2025-02-15 09:55:...|     F| 01/10/2019|
|        009|          103| 33| 58000|2016-06-01|      Tom Tan|2025-02-15|2025-02-15 09:55:...|     M| 01/06/2016|
|        010|          104| 27| 47000|2018-08-01|     Lisa Lee|2025-02-15|2025-02-15 09:55:...|     F| 01/08/2018|
|        011|          104| 38| 65000|2015-11-01|   David Park|2025-02-15|2025-02-15 09:55:...|     M| 01/11/2015|
|        012|          105| 31| 54000|2017-02-15|   Susan Chen|2025-02-15|2025-02-15 09:55:...|     F| 15/02/2017|
|        013|          106| 45| 75000|2011-07-01|    Brian Kim|2025-02-15|2025-02-15 09:55:...|     M| 01/07/2011|
|        014|          107| 26| 46000|2019-01-01|    Emily Lee|2025-02-15|2025-02-15 09:55:...|     F| 01/01/2019|
|        015|          106| 37| 63000|2014-09-30|  Michael Lee|2025-02-15|2025-02-15 09:55:...|     M| 30/09/2014|
|        016|          107| 30| 49000|2018-04-01|  Kelly Zhang|2025-02-15|2025-02-15 09:55:...|     F| 01/04/2018|
|        017|          105| 34| 57000|2016-03-15|  George Wang|2025-02-15|2025-02-15 09:55:...|     M| 15/03/2016|
|        019|          103| 36| 62000|2015-08-01|  Steven Chen|2025-02-15|2025-02-15 09:55:...|     M| 01/08/2015|
|        020|          102| 32| 53000|2018-11-01|    Grace Kim|2025-02-15|2025-02-15 09:55:...|     F| 01/11/2018|
+-----------+-------------+---+------+----------+-------------+----------+--------------------+------+-----------+
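To round out the date functions, shifting dates forward is equally simple. A final hedged sketch; the 90-day window and the column names are invented for illustration, not data from this notebook.

# Sketch: date arithmetic on hire_date
from pyspark.sql.functions import date_add, add_months

emp_dates_demo = (
    emp_fixed
    .withColumn("probation_end", date_add(col("hire_date"), 90))
    .withColumn("first_anniversary", add_months(col("hire_date"), 12))
)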