Skip to content

Usage

Here are some additional examples of using viper to analyze the mtcars dataset.

Single DataFrame

Find the Mercedes models present in the dataset, convert their weight to tonnes, and sort the results by horsepower:

import viper as v
from viper.data import mtcars

df = v.pipeline(
    mtcars,
    v.filter(lambda r: "Merc" in r["model"]),
    v.select("model", "hp", "wt"),
    v.mutate(wt=lambda r: r["wt"] * 0.45359),
    v.arrange("hp desc")
)
df
#>           model   hp        wt
#> 11   Merc 450SE  180  1.846111
#> 12   Merc 450SL  180  1.691891
#> 13  Merc 450SLC  180  1.714570
#> 9      Merc 280  123  1.560350
#> 10    Merc 280C  123  1.560350
#> 8      Merc 230   95  1.428808
#> 7     Merc 240D   62  1.446952

Determine the frequency table of car manufacturers that have at least two models in the dataset:

df = v.pipeline(
    mtcars,
    v.mutate(
        producer=lambda r: r["model"].str.split(' ', expand = True)[0]
    ),
    v.group_by("producer"),
    v.summarize("producer = size()"),
    v.filter(lambda r: r["producer"] >= 2)
)
df
#>           producer
#> producer
#> Fiat             2
#> Hornet           2
#> Mazda            2
#> Merc             7
#> Toyota           2

Two DataFrames

To demonstrate the use of the left_join function, we will calculate the z-scores of the horsepower variable with respect to the population grouped by the number of cylinders, using only viper:

df_metrics = v.pipeline(
    mtcars,
    v.mutate(
        hp_mean=lambda r: r["hp"],
        hp_std=lambda r: r["hp"]
    ),
    v.group_by("cyl"),
    v.summarize(
        "hp_mean = mean()",
        "hp_std = std()"
    )
)
df_metrics
#>         hp_mean     hp_std
#> cyl
#> 4     82.636364  20.934530
#> 6    122.285714  24.260491
#> 8    209.214286  50.976886

df_zscores = v.pipeline(
    mtcars,
    v.select("model", "cyl", "hp"),
    v.left_join(
        df_metrics,
        by = "cyl"
    ),
    v.mutate(
        zscore = lambda r: (r["hp"] - r["hp_mean"]) / r["hp_std"]
    )
)
df_zscores.head()
#>                model  cyl   hp     hp_mean     hp_std    zscore
#> 0          Mazda RX4    6  110  122.285714  24.260491 -0.506408
#> 1      Mazda RX4 Wag    6  110  122.285714  24.260491 -0.506408
#> 2         Datsun 710    4   93   82.636364  20.934530  0.495050
#> 3     Hornet 4 Drive    6  110  122.285714  24.260491 -0.506408
#> 4  Hornet Sportabout    8  175  209.214286  50.976886 -0.671173

The anti_join function is a type of filtering join that can be used to remove from the current dataset any rows that match a row in another dataset. In this example, it will be used to remove models from the dataset that are made by producers that are present in df_filter:

df = v.pipeline(
    mtcars,
    v.mutate(
        producer=lambda r: r["model"].str.split(' ', expand = True)[0]
    ),
)

df_filter = v.pipeline(
    df,
    v.select("producer"),
    v.filter(lambda r: r["producer"] in ["Merc", "Ferrari", "Toyota"]),
    v.distinct("producer")
)
df_filter
#>    producer
#> 7      Merc
#> 19   Toyota
#> 29  Ferrari

df_filtered = v.pipeline(
    df,
    v.anti_join(
        df_filter,
        by = "producer"
    ),
    v.distinct("producer"),
    v.arrange("producer")
)
df_filtered
#>     producer
#> 22       AMC
#> 14  Cadillac
#> 23    Camaro
#> 16  Chrysler
#> 2     Datsun
#> 21     Dodge
#> 6     Duster
#> 17      Fiat
#> 28      Ford
#> 18     Honda
#> 3     Hornet
#> 15   Lincoln
#> 27     Lotus
#> 30  Maserati
#> 0      Mazda
#> 24   Pontiac
#> 26   Porsche
#> 5    Valiant
#> 31     Volvo