WeightedListItem

WeightedListItem

A dictionary with weights. Example: Content type: String (must be a valid filesystem path)
Name of the file containing the weights. Example file:

SIMPLE:
create terminators;
set types = (varchar);
set weights = 1;
add (".": 10);
add ("\;": 2);
add (" --": 1);
add (":": 1);


Complex:
-- fields weights (aliases in parens)
-- =============
-- 1: FIPS code (fips) 1: uniform (uniform)
-- 2: county name (name) 2: population (population)
-- 3: state abbreviation (st) 3: timezone weighting (tz)
-- 4: full state name (state) 4: in zone1 (tz90)
-- 5: ZIP prefix (zone) 5: in zone2 (tz9)
-- 6: gmt offset (gmt) 6: in zone3 (tz1)
--
create fips_county;
set types = (int, varchar, varchar, varchar, varchar, int);
set weights = 6;
set names = (fips, county, st, state, zone, gmt:uniform, population, tz, tz90, tz9, tz1);
add (47187,"Williamson County", "TN", "Tennesee", "3", -5:1, 117569, 1387, 1, 0, 0);
add (46137,"Ziebach County", "SD", "South Dakota", "5", -6:1, 2176, 1148, 1, 0, 0);
add (01127,"Walker County", "AL", "Alabama", "3", -6:1, 71027, 1148, 1, 0, 0);
add (45039,"Fairfield County", "SC", "South Carolina", "2", -5:1, 22394, 1387, 1, 0, 0);
add (39139,"Richland County", "OH", "Ohio", "4", -5:1, 127342, 1387, 1, 0, 0);
add (22041,"Franklin Parish", "LA", "Louisiana", "7", -6:1, 22163, 1148, 1, 0, 0);
add (29061,"Daviess County", "MO", "Mosourri", "6", -6:1, 7842, 1148, 1, 0, 0);

Attributes
Name Description Required Min Max Allowed Values
filename Content type: String (must be a valid filesystem path)
Name of the file containing the weights. Example file:

SIMPLE:
create terminators;
set types = (varchar);
set weights = 1;
add (".": 10);
add ("\;": 2);
add (" --": 1);
add (":": 1);


Complex:
-- fields weights (aliases in parens)
-- =============
-- 1: FIPS code (fips) 1: uniform (uniform)
-- 2: county name (name) 2: population (population)
-- 3: state abbreviation (st) 3: timezone weighting (tz)
-- 4: full state name (state) 4: in zone1 (tz90)
-- 5: ZIP prefix (zone) 5: in zone2 (tz9)
-- 6: gmt offset (gmt) 6: in zone3 (tz1)
--
create fips_county;
set types = (int, varchar, varchar, varchar, varchar, int);
set weights = 6;
set names = (fips, county, st, state, zone, gmt:uniform, population, tz, tz90, tz9, tz1);
add (47187,"Williamson County", "TN", "Tennesee", "3", -5:1, 117569, 1387, 1, 0, 0);
add (46137,"Ziebach County", "SD", "South Dakota", "5", -6:1, 2176, 1148, 1, 0, 0);
add (01127,"Walker County", "AL", "Alabama", "3", -6:1, 71027, 1148, 1, 0, 0);
add (45039,"Fairfield County", "SC", "South Carolina", "2", -5:1, 22394, 1387, 1, 0, 0);
add (39139,"Richland County", "OH", "Ohio", "4", -5:1, 127342, 1387, 1, 0, 0);
add (22041,"Franklin Parish", "LA", "Louisiana", "7", -6:1, 22163, 1148, 1, 0, 0);
add (29061,"Daviess County", "MO", "Mosourri", "6", -6:1, 7842, 1148, 1, 0, 0);
no 0 1
seed Random number generator seed of this Element. Overrides default seeding behavior. no 0 1
name (Class)Name of this element. Used to identify plugin Class. Full name is required. Example: com.en.myPluginPackage.myPluginClass no 0 1
valueColumn Content type: String
The specified list's column which contains the value to generate. Must match a column specified in the respective 'set names' statement.
yes 1 1
weightColumn Content type: String
The specified list's column which contains the weight to use. Must match a column specified in the respective 'set names' statement.
yes 1 1
id Identification String of this element. May be used to uniquely identify a field within the children of an Element. no 0 1
list Content type: String
The name of the list to use from the specified weight file. A list in the file starts with the keyword 'create'.
yes 1 1
Nodes
Name Description Required Min Max Allowed Values
sameChoiceAs Content type: Empty
Requires a 'field' and a 'generatorByID' attribute (referring to a generator in the same table) to pick the row number from. If specified, this WeightedListItem does not choose a random row, but uses the same row as the referenced generator.
no 0 1

Examples

  1. Uniform Weekdays

    Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1, Tuesday: 1, Wednesday: 1, Thursday: 1, Friday: 1, Saturday: 1, Sunday: 1. The generated weekdays are all equally likely to be generated given that they are all weighted with 1.

    Schema config for Uniform Weekdays
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
    /*******************************************************************************
    * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved.
    * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
    ******************************************************************************/
    --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd">
      <!-- All data is derived from this starting seed.
           If this seed is the same, the generated data will be the same on each
           computer/node/platform.
           Change this seed to generate a different data set.-->
      <seed>1234567890L</seed>
    
      <rng name="PdgfDefaultRandom"/>
    
      <!--Default Scale factor for all tables -->
      <property name="SF" type="double">1</property>
    
      <table name="WEIGHTED_LIST_ITEM">
        <!-- if tables should scale with -SF command line argument.
             Specify your scaling formula here: -->
        <size>50 * ${SF}</size>
    
        <!--Uniform Weekdays-->
          <!--
            Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at
            'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1, Tuesday: 1, Wednesday: 1,
            Thursday: 1, Friday: 1, Saturday: 1, Sunday: 1. The generated weekdays are all equally likely to be generated
            given that they are all weighted with 1.
          -->
          <field name="weekday_uniform" size="" type="VARCHAR">
            <gen_WeightedListItem filename="dicts/bigbench/ds-genProbabilities.txt" list="exampleList" valueColumn="day" weightColumn="uniform"/>
          </field>
          </table>
    </schema>
    
    Output for Uniform Weekdays
    Thursday
    Wednesday
    Wednesday
    Sunday
    Friday
    Sunday
    Sunday
    Wednesday
    Wednesday
    Tuesday
    Monday
    Monday
    Thursday
    Monday
    Saturday
    Tuesday
    Thursday
    Thursday
    Monday
    Thursday
    Thursday
    Monday
    Thursday
    Wednesday
    Sunday
    Tuesday
    Thursday
    Saturday
    Sunday
    Tuesday
    Tuesday
    Friday
    Thursday
    Thursday
    Saturday
    Saturday
    Tuesday
    Saturday
    Thursday
    Sunday
    Thursday
    Saturday
    Thursday
    Wednesday
    Monday
    Monday
    Thursday
    Saturday
    Tuesday
    Monday
  2. Weighted Weekdays (Industry)

    Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 4.645, Tuesday: 6.645, Wednesday: 11.23, Thursday: 6.1, Friday: 5.4, Saturday: 2.23, Sunday: 1.0. The days Monday to Friday are more likely to be generated than days on the weekend with Wednesday being the most likely.

    Schema config for Weighted Weekdays (Industry)
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
    /*******************************************************************************
    * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved.
    * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
    ******************************************************************************/
    --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd">
      <!-- All data is derived from this starting seed.
           If this seed is the same, the generated data will be the same on each
           computer/node/platform.
           Change this seed to generate a different data set.-->
      <seed>1234567890L</seed>
    
      <rng name="PdgfDefaultRandom"/>
    
      <!--Default Scale factor for all tables -->
      <property name="SF" type="double">1</property>
    
      <table name="WEIGHTED_LIST_ITEM">
        <!-- if tables should scale with -SF command line argument.
             Specify your scaling formula here: -->
        <size>50 * ${SF}</size>
    
        <!--Weighted Weekdays (Industry)-->
          <!--
            Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at
            'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 4.645, Tuesday: 6.645, Wednesday: 11.23,
            Thursday: 6.1, Friday: 5.4, Saturday: 2.23, Sunday: 1.0. The days Monday to Friday are more likely to be generated
            than days on the weekend with Wednesday being the most likely.
          -->
          <field name="weekday_busy_industry" size="" type="VARCHAR">
            <gen_WeightedListItem filename="dicts/bigbench/ds-genProbabilities.txt" list="exampleList" valueColumn="day" weightColumn="busyDaysIndustry"/>
          </field>
          </table>
    </schema>
    
    Output for Weighted Weekdays (Industry)
    Tuesday
    Monday
    Tuesday
    Tuesday
    Friday
    Thursday
    Wednesday
    Friday
    Thursday
    Monday
    Tuesday
    Tuesday
    Thursday
    Monday
    Wednesday
    Wednesday
    Thursday
    Wednesday
    Thursday
    Monday
    Friday
    Wednesday
    Wednesday
    Wednesday
    Monday
    Monday
    Monday
    Tuesday
    Friday
    Saturday
    Friday
    Wednesday
    Sunday
    Thursday
    Thursday
    Monday
    Thursday
    Thursday
    Monday
    Thursday
    Wednesday
    Friday
    Monday
    Thursday
    Thursday
    Wednesday
    Monday
    Wednesday
    Thursday
    Friday
  3. Weighted Weekdays (Restaurants)

    Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1.4, Tuesday: 2.3, Wednesday: 3.0, Thursday: 3.1, Friday: 5.0, Saturday: 9.2, Sunday: 7.2. The weekdays Friday, Saturday, and Sunday are more likely to be generated than the other weekdays.

    Schema config for Weighted Weekdays (Restaurants)
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
    /*******************************************************************************
    * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved.
    * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
    ******************************************************************************/
    --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd">
      <!-- All data is derived from this starting seed.
           If this seed is the same, the generated data will be the same on each
           computer/node/platform.
           Change this seed to generate a different data set.-->
      <seed>1234567890L</seed>
    
      <rng name="PdgfDefaultRandom"/>
    
      <!--Default Scale factor for all tables -->
      <property name="SF" type="double">1</property>
    
      <table name="WEIGHTED_LIST_ITEM">
        <!-- if tables should scale with -SF command line argument.
             Specify your scaling formula here: -->
        <size>50 * ${SF}</size>
    
        <!--Weighted Weekdays (Restaurants)-->
          <!--
            Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at
            'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1.4, Tuesday: 2.3, Wednesday: 3.0,
            Thursday: 3.1, Friday: 5.0, Saturday: 9.2, Sunday: 7.2. The weekdays Friday, Saturday, and Sunday are more likely
            to be generated than the other weekdays.
          -->
          <field name="weekday_busy_restaurant" size="" type="VARCHAR">
            <gen_WeightedListItem filename="dicts/bigbench/ds-genProbabilities.txt" list="exampleList" valueColumn="day" weightColumn="buysDaysRestaurants"/>
          </field>
          </table>
    </schema>
    
    Output for Weighted Weekdays (Restaurants)
    Saturday
    Wednesday
    Saturday
    Sunday
    Saturday
    Saturday
    Saturday
    Wednesday
    Friday
    Monday
    Sunday
    Tuesday
    Saturday
    Sunday
    Saturday
    Saturday
    Thursday
    Friday
    Thursday
    Sunday
    Sunday
    Saturday
    Sunday
    Wednesday
    Monday
    Saturday
    Sunday
    Thursday
    Tuesday
    Monday
    Thursday
    Friday
    Sunday
    Sunday
    Saturday
    Saturday
    Saturday
    Saturday
    Friday
    Thursday
    Saturday
    Tuesday
    Saturday
    Friday
    Tuesday
    Friday
    Saturday
    Thursday
    Wednesday
    Sunday
  4. Weighted Ranges using Histogram

    Generates long numbers using a given histogram at 'config/usecases/distinctExample-weightedLists.txt'. For each row the range (min and max) is dynamically chosen from the weighted list. In 60% of the choices the number will be between 1 and 9, in 30% of the choices between 20 and 50, and in 10% of the choices between 100 and 199. The numbers within the dynamically chosen range are generated uniformly. Note that the max value needs to use the 'sameChoiceAs' parameter in order to pick the same range as for the min value.

    Schema config for Weighted Ranges using Histogram
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
    /*******************************************************************************
    * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved.
    * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
    ******************************************************************************/
    --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd">
      <!-- All data is derived from this starting seed.
           If this seed is the same, the generated data will be the same on each
           computer/node/platform.
           Change this seed to generate a different data set.-->
      <seed>1234567890L</seed>
    
      <rng name="PdgfDefaultRandom"/>
    
      <!--Default Scale factor for all tables -->
      <property name="SF" type="double">1</property>
    
      <table name="WEIGHTED_LIST_ITEM">
        <!-- if tables should scale with -SF command line argument.
             Specify your scaling formula here: -->
        <size>50 * ${SF}</size>
    
        <!--Weighted Ranges using Histogram-->
          <!--
            Generates long numbers using a given histogram at 'config/usecases/distinctExample-weightedLists.txt'. For each
            row the range (min and max) is dynamically chosen from the weighted list. In 60% of the choices the number will
            be between 1 and 9, in 30% of the choices between 20 and 50, and in 10% of the choices between 100 and 199.
            The numbers within the dynamically chosen range are generated uniformly. Note that the max value needs to
            use the 'sameChoiceAs' parameter in order to pick the same range as for the min value.
          -->
          <field name="long_number_by_histogram" size="" type="NUMERIC">
            <gen_LongNumber>
              <min>
                <gen_WeightedListItem filename="config/usecases/distinctExample-weightedLists.txt" id="range" list="weighted_ranges_test2" valueColumn="min" weightColumn="weighted"/>
              </min>
              <max>
                <gen_WeightedListItem filename="config/usecases/distinctExample-weightedLists.txt" list="weighted_ranges_test2" valueColumn="max" weightColumn="weighted">
                  <sameChoiceAs field="long_number_by_histogram" generatorByID="range"/>
                </gen_WeightedListItem>
              </max>
            </gen_LongNumber>
          </field>
          </table>
    </schema>
    
    Output for Weighted Ranges using Histogram
    2
    47
    5
    2
    6
    9
    45
    2
    8
    8
    6
    5
    9
    8
    40
    5
    26
    154
    29
    40
    40
    45
    46
    153
    6
    23
    2
    34
    21
    1
    1
    1
    40
    36
    4
    26
    21
    2
    42
    46
    33
    151
    29
    9
    21
    30
    27
    6
    188
    24
2.6_#1486_b758 | 2016-05-24