Pandas DataFrames¶

We use dman to store and load a pandas DataFrame.

To do so, we need the following imports:

import datetime, textwrap, os, urllib.request, tempfile
import pandas as pd

import dman
from dman import tui

We then load the iris dataset.

# Name of the dataset; also used as the stem of the downloaded file.
KEY = "iris"
# Location of the raw iris data (read below without a header row).
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names, supplied by hand because the file is read with header=None.
HEADER = ["sepal.length", "sepal.width", "petal.length", "petal.width", "variety"]

# Download into a throwaway directory; the DataFrame lives in memory, so the
# file itself is no longer needed once it has been parsed.
with tempfile.TemporaryDirectory() as base:
    path = os.path.join(base, f"{KEY}.data")
    urllib.request.urlretrieve(URL, path)
    df = pd.read_csv(path, sep=",", header=None, names=HEADER)

# Keep only the part of each label after its last "-".
df["variety"] = df["variety"].apply(lambda label: label.rpartition("-")[2])

To make the DataFrame storable, we register it with dman manually.

# Register ``pd.DataFrame`` with dman under the type name "pd_dataframe",
# using CSV as the on-disk format.
#
# ``DataFrame.to_csv`` writes the index as the first (unnamed) column. The
# reader therefore restores that column as the index via ``index_col=0``;
# without it, every save/load cycle turns the index into an extra
# "Unnamed: 0" data column and the frame grows by one column per round trip.
dman.register_storable(
    "pd_dataframe",
    pd.DataFrame,
    write=lambda df, path: df.to_csv(path),
    read=lambda path: pd.read_csv(path, index_col=0),
)
# Extension given to the files in which DataFrames are stored.
pd.DataFrame.__ext__ = '.csv'

We can now save a DataFrame directly using a record.

# Wrap the DataFrame in a record whose file stem is 'iris' and save it.
iris_record = dman.record(df, stem='iris')
dman.save('iris', iris_record)

# Load the record back and take the DataFrame out of it.
loaded_record = dman.load('iris')
df = loaded_record.content
print(df)

# Show the files that were written, including the content of the .json files.
iris_dir = dman.mount('iris')
tui.walk_directory(iris_dir, show_content=['.json'])

     Unnamed: 0  sepal.length  ...  petal.width    variety
0             0           5.1  ...          0.2     setosa
1             1           4.9  ...          0.2     setosa
2             2           4.7  ...          0.2     setosa
3             3           4.6  ...          0.2     setosa
4             4           5.0  ...          0.2     setosa
..          ...           ...  ...          ...        ...
145         145           6.7  ...          2.3  virginica
146         146           6.3  ...          1.9  virginica
147         147           6.5  ...          2.0  virginica
148         148           6.2  ...          2.3  virginica
149         149           5.9  ...          1.8  virginica

[150 rows x 6 columns]
📂 .dman/cache/examples:cases:example_pandas/iris
┣━━ 📄 iris.csv (4.3 kB)
┗━━ 📄 iris.json (131 bytes)
     ──────────────────────────────────────────────────────────────────────────
      {
        "_ser__type": "_ser__record",
        "_ser__content": {
          "target": "iris.csv",
          "sto_type": "pd_dataframe"
        }
      }
     ──────────────────────────────────────────────────────────────────────────

Alternatively we can define a more complex storage architecture.

def _now_isoformat():
    """Return the current (naive, local) time as an ISO 8601 string."""
    return datetime.datetime.now().isoformat()


@dman.modelclass(storable=True)
class DataItem:
    """A DataFrame together with a description and a creation timestamp."""

    # Declared as a record field with the file stem 'data'.
    data: pd.DataFrame = dman.recordfield(stem='data')
    # Free-text description of the data; empty by default.
    description: str = ''
    # ISO 8601 timestamp, filled in when the item is created.
    created: str = dman.field(default_factory=_now_isoformat)


# Free-text description of the dataset; dedent removes the common indentation
# of the literal below.
description = textwrap.dedent('''
    This is perhaps the best known database to be found in the pattern recognition literature.
    Fisher's paper is a classic in the field and is referenced frequently to this day.
    (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each,
    where each class refers to a type of iris plant. One class is linearly separable
    from the other 2; the latter are NOT linearly separable from each other.
    Predicted attribute: class of iris plant.
    This is an exceedingly simple domain.
    This data differs from the data presented in Fishers article (identified by
    Steve Chadwick, spchadwick '@' espeedaz.net ). The 35th sample should be:
    4.9,3.1,1.5,0.2,"Iris-setosa" where the error is in the fourth feature.
    The 38th sample: 4.9,3.6,1.4,0.1,"Iris-setosa" where the errors are in the
    second and third features.
    Source: https://archive.ics.uci.edu/ml/datasets/iris
    ''')

# Bundle the DataFrame with its description; `created` takes its default.
item = DataItem(df, description)

# A model dictionary configured to store entries by key, each in a subdirectory.
container = dman.mdict(store_by_key=True, store_subdir=True)
container['iris'] = item
dman.save('dataframes', container)

# Load the container back and fetch the stored item by its key.
restored = dman.load('dataframes')
item: DataItem = restored['iris']
print(item.data)

# Show the files that were written, including the content of the .json files.
dataframes_dir = dman.mount('dataframes')
tui.walk_directory(dataframes_dir, show_content=['.json'])

     Unnamed: 0.1  Unnamed: 0  ...  petal.width    variety
0               0           0  ...          0.2     setosa
1               1           1  ...          0.2     setosa
2               2           2  ...          0.2     setosa
3               3           3  ...          0.2     setosa
4               4           4  ...          0.2     setosa
..            ...         ...  ...          ...        ...
145           145         145  ...          2.3  virginica
146           146         146  ...          1.9  virginica
147           147         147  ...          2.0  virginica
148           148         148  ...          2.3  virginica
149           149         149  ...          1.8  virginica

[150 rows x 7 columns]
📂 .dman/cache/examples:cases:example_pandas/dataframes
┣━━ 📂 iris
┃   ┣━━ 📄 data.csv (4.8 kB)
┃   ┗━━ 📄 iris.json (1.1 kB)
┃        ──────────────────────────────────────────────────────────────────────
┃         {
┃           "data": {
┃             "_ser__type": "_ser__record",
┃             "_ser__content": {
┃               "target": "data.csv",
┃               "sto_type": "pd_dataframe"
┃             }
┃           },
┃           "description": "\nThis is perhaps the best known database to be fo
┃           "created": "2023-01-04T10:12:25.820047"
┃         }
┃        ──────────────────────────────────────────────────────────────────────
┗━━ 📄 dataframes.json (380 bytes)
     ──────────────────────────────────────────────────────────────────────────
      {
        "_ser__type": "_ser__mdict",
        "_ser__content": {
          "store": {
            "iris": {
              "_ser__type": "_ser__record",
              "_ser__content": {
                "target": "iris/iris.json",
                "sto_type": "DataItem"
              }
            }
          },
          "store_by_key": true,
          "store_subdir": true
        }
      }
     ──────────────────────────────────────────────────────────────────────────

Total running time of the script: ( 0 minutes 0.939 seconds)

Gallery generated by Sphinx-Gallery