This dataset example is a simplified record of item sales made by a chain of retail shops.
Each record is an individual item purchased from a shop belonging to this imaginary company.
Schema
| Name | Type | Example Value | Description |
| --- | --- | --- | --- |
| UTCTimestamp | float | 1451914602.0 | Unix timestamp representing the time the transaction took place. Timezone is UTC. |
| TransactionID | str | T0000 | Identifier given to an individual transaction, or shopping basket. Several records can have the same TransactionID if the items were all bought together. |
| ShopID | str | Glasgow | Identifier given to the shop branch at which the item was sold. |
| ProductID | str | A63 | Identifier given to the type of product that was sold. |
| SaleAmount | float | 23.0 | Amount the item was sold for. |
Example Records
1451914602.0 T0000 Glasgow G12 26.0
1451914602.0 T0000 Glasgow A63 35.0
1451914602.0 T0000 Glasgow D95 23.0
1451914602.0 T0000 Glasgow A58 81.0
1451914602.0 T0000 Glasgow B01 30.0
1451914602.0 T0000 Glasgow E64 4.0
1451914602.0 T0000 Glasgow E67 42.0
1451914602.0 T0000 Glasgow B57 16.0
1451914602.0 T0000 Glasgow G17 13.0
1451914632.0 T0001 Glasgow E64 81.0
1451914632.0 T0001 Glasgow G56 68.0
1451914632.0 T0001 Glasgow D11 21.0
1451914632.0 T0001 Glasgow B36 35.0
1451914632.0 T0001 Glasgow C87 16.0
1451914632.0 T0001 Glasgow A85 16.0
1451914646.0 T0002 Cardiff B75 75.0
1451914646.0 T0002 Cardiff B33 43.0
1451914646.0 T0002 Cardiff G72 91.0
1451914646.0 T0002 Cardiff A90 6.0
1451914646.0 T0002 Cardiff B53 77.0
1451914646.0 T0002 Cardiff D02 14.0
1451914646.0 T0002 Cardiff G97 19.0
Generating Sample Data Sets
import math
import random
import time

CAT_LETTERS = "ABCDEFG"

# Towns/cities that have a shop; a ShopID is simply one of these names.
SHOP_IDS = ["Cardiff", "Exeter", "Glasgow"]

# Number of data points to generate.
N = 10000

# File to store the data points in.
DATA_FILE_NAME = "sample_data.csv"

# Largest number of items that a single shopping basket may contain.
MAX_ITEMS_PER_BASKET = 10


def get_transaction_id(transaction_number):
    """
    :param transaction_number: int: Global counter representing number of
        transactions/shopping baskets generated so far.
    :return: str: A new TransactionID (the next in the sequence).

    TransactionID is "T" plus an int padded with leading 0's to at least
    4 digits. e.g. "T0128".

    Note: transaction_number is updated by the record creating loop, not this
    function.
    """
    # %-formatting (rather than an f-string) keeps the script runnable on
    # both Python 2 and Python 3.
    return "T%04d" % transaction_number


def get_shop_id():
    """
    :return: str: A valid ShopID.

    In this case, a ShopID is the name of the town or city a shop is
    located in.

    Methodology: Select a random shop from a predefined list.
    """
    return random.choice(SHOP_IDS)


def get_product_id():
    """
    :return: str: A valid ProductID.

    A ProductID is a string consisting of a letter between A and G inclusive
    and two digits. e.g. "D45"

    Methodology: Randomly select a letter from "ABCDEFG", then randomly
    select a number between 0 and 99 inclusive and zero pad it.
    """
    selected_category = random.choice(CAT_LETTERS)
    selected_number = random.randint(0, 99)
    return "%s%02d" % (selected_category, selected_number)


def get_sale_amount():
    """
    :return: float: A valid SaleAmount.

    Methodology: Select a random whole number (in floating point form)
    between 1.0 and 100.0 inclusive.
    """
    return float(random.randint(1, 100))


def generate_records(num_records, start_time):
    """
    :param num_records: int: Exact number of records (purchased items) to
        produce. Zero or a negative number produces no records.
    :param start_time: int or float: Unix timestamp (UTC) given to the first
        transaction.
    :return: generator of list: One
        [UTCTimestamp, TransactionID, ShopID, ProductID, SaleAmount] list per
        record, in the order the records were generated.

    Methodology: Records are created one shopping basket at a time. Every
    item in a basket shares the same timestamp, TransactionID and ShopID.
    Each new basket starts between 0 and 100 seconds after the previous one
    and holds between 1 and MAX_ITEMS_PER_BASKET items.
    """
    remaining = num_records
    # Always carry the timestamp as a float so that it is written out in the
    # documented form (e.g. "1451914602.0") whatever type the caller passed.
    current_time = float(start_time)
    num_baskets_generated = 0

    while remaining > 0:
        # Generate timestamp, transaction ID and shop ID.
        timestamp_utc = current_time
        current_time += random.randint(0, 100)
        transaction_id = get_transaction_id(num_baskets_generated)
        num_baskets_generated += 1
        shop_id = get_shop_id()

        # Decide how many products to create in this shopping basket. The
        # upper bound is capped so that the last basket can never push the
        # total past the requested number of records.
        num_items_to_create = random.randint(
            1, min(MAX_ITEMS_PER_BASKET, remaining))

        # Generate that many product entries.
        for _ in range(num_items_to_create):
            product_id = get_product_id()
            sale_amount = get_sale_amount()
            yield [timestamp_utc, transaction_id, shop_id,
                   product_id, sale_amount]

        # Take away the number of entries just generated from the overall
        # counter.
        remaining -= num_items_to_create


def write_sample_data(path=DATA_FILE_NAME, num_records=N):
    """
    :param path: str: File to store the data points in. An existing file is
        overwritten.
    :param num_records: int: Number of data points to generate.
    :return: None

    Writes one record per line, with the fields separated by tab characters.
    The first transaction is timestamped with the current time, rounded down
    to a whole second.
    """
    # math.floor() returns an int on Python 3, so convert back to float to
    # keep the UTCTimestamp column in floating point form.
    start_time = float(math.floor(time.time()))

    # Binary mode plus an explicit encode gives "\n" line endings on every
    # platform and works on both Python 2 and Python 3. The with-block
    # guarantees the file is closed even if generation fails part-way.
    with open(path, "wb") as data_file:
        for record in generate_records(num_records, start_time):
            line = "\t".join(map(str, record)) + "\n"
            data_file.write(line.encode("ascii"))


if __name__ == "__main__":
    write_sample_data()