Workspace
Javier Canales Luna/

pandas 2.0 vs polars

0
Beta
Spinner

Pandas 2.0 vs polars performance testing for data manipulation

Install and import libraries

%%capture
!pip install polars
!pip install pandas==2.0.0
Hidden output
import numpy as np
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from string import ascii_letters
import random

sns.set()

Create synthetic dataset

def create_table(n):
    np.random.seed(42)
    # Create dataframe with sales information
    df = pd.DataFrame({'id' : [''.join(random.choice(ascii_letters) for x in range(10)) for _ in range(n)],
                       'date' :pd.date_range(start='1980-01-01', periods=n, freq='T'),
                       'office' : np.random.choice(['United States',
                                                    'Brasil','Spain',
                                                    'France','China',
                                                    'United Kingdom',
                                                    'Italy','Canada',
                                                    'India','Argentina'],size=n).astype(str),
                       'sales' : np.random.randint(0,10000, size= n).astype(np.int32),
                      'revenue' : np.random.randint(0,10000, size= n).astype(np.int32)})
    
    # Add some random nan values
   
    cols_list = df.select_dtypes('number').columns.tolist()
        
    for col in df[cols_list]:
        df.loc[df.sample(frac=0.05).index, col] = np.nan
                    
    return df
def create_table_2(n):
    # Fictional databse with transactions in Italy office during 2022

    np.random.seed(42)
    df = pd.DataFrame({'id' : italy_2022_id,
                       'year':2022,
                       'name' : [''.join(random.choice(ascii_letters) for x in range(4)) for _ in range(n)],
                       'surname' : [''.join(random.choice(ascii_letters) for x in range(6)) for _ in range(n)],
                       'responsibility' : np.random.choice(['Sales Director', 
                                                            'Sales Manager',
                                                            'Sales Intern'],size=n).astype(str), 
                       'sex' :  np.random.choice(['male','female','non-binary'], size =n).astype(str)})
                       
    return df
    
df = create_table(22616640)
#df.to_csv('example.csv',index=False)
 
Run cancelled
italy_2022_id = df_pd.query('office == "Italy" and date.dt.year == 2022')['id'].values
italy_2022 = create_table_2(52568)
italy_2022_pl = pl.DataFrame(italy_2022)

Plotting functions

def reading_comparison(pd_time, pd_pyarrow_time, pl_time, title):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
    times = [pd_time, pd_pyarrow_time, pl_time]
    times = np.round(times, 2)

    sns.barplot(x = ['pandas (numpy)','pandas (pyarrow)','polars'], 
                y = times, 
                edgecolor='black')


    ax.set_title(f"{title} Test")
    ax.set_ylabel("Running time (seconds)")
    ax.bar_label(ax.containers[0])
    plt.savefig(f'./pandas_vs_polars_{title}.png', transparent=False,  facecolor='white', bbox_inches="tight")
    plt.show()
def plot_comparison(pd_time, pl_time, title):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
    times = [pd_time, pl_time]
    times = np.round(times, 2)

    sns.barplot(x = ['pandas','polars'], 
                y = times, 
                edgecolor='black')


    ax.set_title(f"{title} Test")
    ax.set_ylabel("Running time (seconds)")
    ax.bar_label(ax.containers[0])
    plt.savefig(f'./pandas_vs_polars_{title}.png', transparent=False,  facecolor='white', bbox_inches="tight")
    plt.show()

Reading data test

s = time.time()
df_pd = pd.read_csv("./example.csv")
df_pd = df_pd[['id', 'date', 'office', 'sales']]
df_pd.query('office =="France"')
e = time.time()
pd_time= e-s
print("pandas Loading Time = {}".format(pd_time))
s = time.time()
df_pd_arrow = pd.read_csv("./example.csv", engine="pyarrow")
df_pd_arrow= df_pd_arrow[['id', 'date', 'office', 'sales']]
df_pd_arrow.query('office == "France"')
e = time.time()
pd_pyarrow_time= e-s

print("pandas pyarrow Loading Time = {}".format(pd_pyarrow_time))



  • AI Chat
  • Code