Skip to content

Snapshots

ArcticDB enables multi-symbol snapshotting. Snapshots enable multiple symbols to be versioned together via a human-readable string name.

In practice, this is useful for tying derived data to the source data it was computed from.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import glob

import arcticdb as adb
import pandas as pd

# This example assumes the variables below (HOST, BUCKET, ACCESS, SECRET) are validly set
ac = adb.Arctic(f"s3://{HOST}:{BUCKET}?access={ACCESS}&secret={SECRET}")

library_name = "my_library"

if library_name not in ac.list_libraries():
    ac.create_library(library_name)

library = ac[library_name]

# Every pricing file is appended to this one symbol, so it grows into a single time series.
pricing_symbol = "PRICING"

# Assumes there are CSV files containing pricing data and factor data, named '<KIND>_<date>.csv'.
# Files are processed in date order. Each time we've written a new factor file to its symbol,
# we'll take a snapshot across all symbols.
for i, f in enumerate(sorted(glob.glob('*.csv'), key=lambda name: name.split('_')[1].split('.')[0])):
    df = pd.read_csv(f)
    if 'FACTORS' in f:
        # Each factor file is written to a symbol named after the file itself
        library.write(f, df)
        # SNAP_{i} will forever point to all symbols that exist at this time at their current latest version
        library.snapshot(f"SNAP_{i}")
    else:
        # The first CSV column holds the timestamps: make it the index and parse it as datetimes
        df = df.set_index(df.columns[0])
        df.index = pd.to_datetime(df.index)

        library.append(pricing_symbol, df, write_if_missing=True)

# Inspect the result: list the snapshots, then the symbols captured by the first one returned
snapshots = library.list_snapshots()
symbols = library.list_symbols(snapshot_name=list(snapshots.keys())[0])

To generate the pricing and factor CSV files that the example above reads, the following code can be used:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import argparse
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

def run(num_files, num_symbols):
    """Write random weekly pricing CSVs, plus a factors CSV every third week.

    Files are written to the current working directory as
    ``PRICING_<start>.csv`` and ``FACTORS_<start>.csv``, where ``<start>`` is
    the midnight timestamp at which that file's week begins.

    Args:
        num_files: Number of weekly pricing files to write. The first file
            starts ``num_files`` weeks before today; each later file starts
            one week after the previous one.
        num_symbols: Number of ``SYM_<i>`` columns in every generated file.
    """
    index_size = 7 * 24  # one row per hour for a full week
    symbol_columns = ['SYM_%d' % i for i in range(num_symbols)]
    factor_names = ['FACTOR_1', 'FACTOR_2', 'FACTOR_3', 'FACTOR_4', 'FACTOR_5']

    # Truncate to midnight so every file starts on a whole-day boundary.
    starting_date = datetime.today() - timedelta(weeks=num_files)
    starting_date = datetime(starting_date.year, starting_date.month, starting_date.day)

    for file_num in range(num_files):
        this_file_starting_date = starting_date + timedelta(weeks=file_num)

        df = pd.DataFrame(
            np.random.randint(0, index_size, size=(index_size, num_symbols)),
            columns=symbol_columns,
        )
        # A timedelta step is used instead of the "H" alias, which pandas 2.2 deprecated.
        df.index = pd.date_range(this_file_starting_date, periods=index_size, freq=timedelta(hours=1))

        df.to_csv(f"PRICING_{this_file_starting_date}.csv")

        # Factor data is produced for every third file only (file 0, 3, 6, ...).
        if file_num % 3 == 0:
            df = pd.DataFrame(
                np.random.randint(0, len(factor_names), size=(len(factor_names), num_symbols)),
                columns=symbol_columns,
            )
            df['FACTORS'] = factor_names
            df.to_csv(f"FACTORS_{this_file_starting_date}.csv")

        print(f"Written {file_num + 1} / {num_files}")


if __name__ == "__main__":
    # Command-line entry point: read the two integer options and generate the CSV files.
    cli = argparse.ArgumentParser()
    for flag, default_value in (('--num-files', 15), ('--symbols-per-file', 500)):
        cli.add_argument(flag, type=int, default=default_value)

    options = cli.parse_args()

    # argparse exposes '--symbols-per-file' as the attribute 'symbols_per_file'.
    run(num_files=options.num_files, num_symbols=options.symbols_per_file)