WL#2223: NdbRecord

Affects: Server-6.0 — Status: Complete

Description
High Level Architecture
Low Level Design

Introduce NdbRecord
NdbRecord is linear memory per operation
NdbRecord can be mapped (almost) directly to mysql record

Lots of code can be removed, and
lots of optimizations can be added, when using records.

Furthermore, it is quite simple to make this backward compatible.

---

An NdbRecord object provides a mapping to a full or a partial record stored in ndb.
An NdbRecord can also be used together with a bitmap to access only part of the record.

a NdbRecord is prepared by API program
a NdbRecord is validated and finalized by ndbapi
a NdbRecord-object can be used simultaneously
      in several operations
      in several transactions
      in several threads
      (i.e. the actual NdbRecord object should not contain any data references)

Usage of NdbRecord can be phased in (as deliverables or internally) as:
* uk read
* pk read
* table scan
* index scan
* pk dml
* uk dml
* index bounds

* wl1496 is considered dependent on this wl, as it will be so much simpler
  to implement once this is done

---

Logically readTuple(NdbRecord) <=> readTuple() + getValue on each col in record
          insertTuple(NdbRecord) <=> insertTuple() + setValue on each col in record
          etc...

NdbRecord *record= dict->createRecord(table, record_specification,
number_of_columns, sizeof(record_specification[0]));

NdbRecord *key_record= dict->createRecord(table, record_specification,
number_of_key_columns, sizeof(record_specification[0]));

char* buf0 = malloc(sizeof(TUPLE));
char* buf1 = malloc(sizeof(TUPLE));
char* buf2 = malloc(sizeof(TUPLE));

// set key in buf0
// set key in buf1

 // Read all columns in record into buf2
op0 = pTrans->readTuple(key_record, buf0, record, buf2,LM_Read, 0);

 // Read only the columns in record whose bit is set in columnmask
op0 = pTrans->readTuple(key_record, buf0, record, buf2,LM_Read, columnmask);

API:

/*
  Describes where one column lives inside an application-defined row buffer.
  An array of these entries (one per column) is passed to createRecord(),
  together with the number of entries and the size of one entry.
*/
struct RecordSpecification {
  /*
    Column described by this entry (the column's maximum size defines the
    field size in the row).
    Note that even when creating an NdbRecord for an index, the column
    pointers must be to columns obtained from the underlying table, not
    from the index itself.
  */
  const Column *column;
  /* Offset of the column data from the start of a row. */
  Uint32 offset;
  /* Offset, from the start of the row, of the byte containing the NULL bit. */
  Uint32 nullbit_byte_offset;
  /* Number (0-7) of the NULL bit within the byte at nullbit_byte_offset. */
  Uint32 nullbit_bit_in_byte;
};

/*
  NdbRecord-related additions to the dictionary interface: creating and
  releasing NdbRecord objects from an array of RecordSpecification entries.
*/
class NdbDictionary {
  ...
  /*
    Create an NdbRecord for use in table operations.

    recSpec points to an array of column descriptions, length is the number
    of entries in that array, and elemSize is the size of one entry (the
    usage example passes sizeof(record_specification[0])).
  */
  NdbRecord *createRecord(const Table *table,
			  const RecordSpecification *recSpec,
			  Uint32 length,
			  Uint32 elemSize);

  /*
    Create an NdbRecord for use in index operations.

    Takes the same recSpec/length/elemSize arguments as the table variant.
    The column pointers in recSpec must still come from the underlying
    table, not from the index itself.
  */
  NdbRecord *createRecord(const Index *index,
                          const Table *table,
			  const RecordSpecification *recSpec,
			  Uint32 length,
			  Uint32 elemSize);
  /*
    NOTE(review): not described in this specification; presumably releases
    an NdbRecord previously returned by createRecord() -- confirm.
  */
  void releaseRecord(NdbRecord *rec);
};

/*
  NdbRecord-related additions to the transaction interface: primary key
  operations, table scans and index scans that take NdbRecord row layouts
  instead of per-column getValue()/setValue() calls.
*/
class NdbTransaction {
  ...
  /*
    Primary key NdbRecord operations.

    key_rec/key_row give the layout and the buffer holding the key;
    result_rec/result_row (for reads) give the layout and the buffer that
    receives the columns read. The optional mask restricts the operation to
    a subset of the columns in the record.
    NOTE(review): the exact mask semantics for insert/update/write are not
    spelled out in this specification -- confirm.
  */
  NdbOperation *readTuple(const NdbRecord *key_rec, const char *key_row,
			  const NdbRecord *result_rec, char *result_row,
			  NdbOperation::LockMode lock_mode= NdbOperation::LM_Read,
			  const unsigned char *result_mask= 0);
  NdbOperation *insertTuple(const NdbRecord *rec, const char *row,
			    const unsigned char *mask= 0);
  NdbOperation *updateTuple(const NdbRecord *key_rec, const char *key_row,
			    const NdbRecord *attr_rec, const char *attr_row,
			    const unsigned char *mask= 0);
  /*
    NOTE(review): unlike the sibling operations, mask has no default value
    here -- confirm whether this is intentional.
  */
  NdbOperation *writeTuple(const NdbRecord *key_rec, const char *key_row,
			   const NdbRecord *attr_rec, const char *attr_row,
			   const unsigned char *mask);
  NdbOperation *deleteTuple(const NdbRecord *key_rec, const char *key_row);

  /*
    Scan a table, using NdbRecord to read out column data.

    The result_record pointer must remain valid until after the call to
    execute().

    The result_mask pointer is optional; if present, only columns for which
    the corresponding bit in result_mask is set will be retrieved in the
    scan. The result_mask is copied internally, so in contrast to
    result_record it need not be valid at execute().

    The parallel argument is the desired parallelism, or 0 for maximum
    parallelism (receiving rows from all fragments in parallel).

    NOTE(review): the scan_flags and batch arguments are not described in
    this specification.
  */
  NdbScanOperation *
  scanTable(const NdbRecord *result_record,
            NdbOperation::LockMode lock_mode= NdbOperation::LM_Read,
            const unsigned char *result_mask= 0,
            Uint32 scan_flags= 0,
            Uint32 parallel= 0,
            Uint32 batch= 0);

  /*
    Do an index range scan (optionally ordered) of a table.

    The key_record describes the index to be scanned. It must be a
    primary key record for the index, i.e. it must specify exactly the
    key columns of the index.

    The result_record describes the rows to be returned from the scan. For an
    ordered index scan, result_record must be a key record for the index to
    be scanned, that is, it must include at least all of the columns in the
    index.

    Both the key_record and the result_record must be created from the Index
    to be scanned, not from the underlying table.

    The call uses a callback function as a flexible way of specifying multiple
    range bounds. The callback will be called once for each bound to define
    lower and upper key value etc.

    The callback receives a private callback_data void *, and the index of
    the bound, counting from 0.
    NOTE(review): the original text gave the index range as
    "0 .. num_key_bounds"; with one call per bound the last index is
    presumably num_key_bounds-1 -- confirm.
    It is guaranteed that the callback will be called in ordered sequence,
    so it is permissible to ignore the passed bound_index and just return
    the values for the next bound (for example if data is kept in a linked
    list).

    The callback can return 0 to denote success, and -1 to denote error (the
    latter causing the creation of the NdbIndexScanOperation to fail).

    This multi-range method is only for use in mysqld code, which is why it
    is declared private.
  */
private:
  NdbIndexScanOperation *
  scanIndex(const NdbRecord *key_record,
            int (*get_bound_callback)(void *callback_data,
                                      Uint32 bound_index,
                                      NdbIndexScanOperation::IndexBound & bound),
            void *callback_data,
            Uint32 num_key_bounds,
            const NdbRecord *result_record,
            NdbOperation::LockMode lock_mode= NdbOperation::LM_Read,
            const unsigned char *result_mask= 0,
            Uint32 scan_flags= 0,
            Uint32 parallel= 0,
            Uint32 batch= 0);

public:

  /*
    A convenience wrapper for simpler specification of a single bound.

    The low_... and high_... arguments presumably have the same meaning as
    the identically named fields of NdbIndexScanOperation::IndexBound --
    confirm. The remaining arguments match the multi-range scanIndex() above.
  */
  NdbIndexScanOperation *
  scanIndex(const NdbRecord *key_record,
            const char *low_key,
            Uint32 low_key_count,
            bool low_inclusive,
            const char * high_key,
            Uint32 high_key_count,
            bool high_inclusive,
            const NdbRecord *result_record,
            NdbOperation::LockMode lock_mode= NdbOperation::LM_Read,
            const unsigned char *result_mask= 0,
            Uint32 scan_flags= 0,
            Uint32 parallel= 0,
            Uint32 batch= 0);

};

/*
  NdbRecord-related additions to the scan operation interface: fetching the
  next row as a pointer into a row buffer, and scan lock take-over
  operations on the current row.
*/
class NdbScanOperation {
  ...
  /*
    NdbRecord version of nextResult.
    When the call returns 0, out_row is set to point to the next row. This
    pointer is valid (only) until the next call to nextResult() with
    fetchAllowed==true.
    The NdbRecord object defining the row format was specified in the
    NdbTransaction::scanTable (or scanIndex) call.
    NOTE(review): the meaning of non-zero return values and of forceSend is
    not described in this specification.
  */
  int nextResult(const char * & out_row,
                 bool fetchAllowed = true, bool forceSend = false);

  /*
    NdbRecord versions of scan lock take-over operations.

    Note that calling NdbRecord scan lock take-over on an NdbRecAttr-style
    scan is not valid, nor is calling NdbRecAttr-style scan lock take-over
    on an NdbRecord-style scan.
  */

  /*
    Take over the lock without changing the row.
    Optionally also read from the row (call with the default value NULL for
    row to not read any attributes).
    The NdbRecord * is required even when not reading any attributes.
    NOTE(review): mask presumably selects which columns of record are read
    -- confirm.
  */
  NdbOperation *lockCurrentTuple(NdbTransaction *takeOverTrans,
                                 const NdbRecord *record,
                                 char *row= 0,
                                 const unsigned char *mask= 0);

  /*
    Update the current tuple, NdbRecord version.
    Values to update with are contained in the passed-in row.
    NOTE(review): mask presumably selects which columns of record are
    updated -- confirm.
  */
  NdbOperation *updateCurrentTuple(NdbTransaction *takeOverTrans,
                                   const NdbRecord *record,
                                   const char *row,
                                   const unsigned char *mask= 0);

  /* Delete the current tuple, NdbRecord version. */
  NdbOperation *deleteCurrentTuple(NdbTransaction *takeOverTrans,
                                   const NdbRecord *record);

};

/*
  NdbRecord-related additions to the index scan operation interface: the
  IndexBound structure used to pass one range (lower and upper bound) to
  NdbTransaction::scanIndex().
*/
class NdbIndexScanOperation {
  ...
  /* Structure used to describe index scan bounds, for NdbRecord scans. */
  struct IndexBound {
    /* Row containing lower bound, or NULL for scan from the start. */
    const char *low_key;
    /* Number of columns in lower bound, for bounding by partial prefix. */
    Uint32 low_key_count;
    /*
      True if the lower bound is inclusive (low_key <= row), false if it is
      exclusive (low_key < row).
    */
    bool low_inclusive;
    /* Row containing upper bound, or NULL for scan to the end. */
    const char * high_key;
    /* Number of columns in upper bound, for bounding by partial prefix. */
    Uint32 high_key_count;
    /*
      True if the upper bound is inclusive (high_key >= row), false if it is
      exclusive (high_key > row).
    */
    bool high_inclusive;
    /*
      Value to identify this bound, may be read with get_range_no().
      Must be < 8192 (set to zero if not using range_no).
      Note that for ordered scans, the range_no must be strictly increasing
      for each range, or the result set will not be sorted correctly.
    */
    Uint32 range_no;
  };
};

- index scan bounds
- index scan merge

estimate two weeks from 25/1-07

- scan take-over
- review comment pk operations

jo: 3 days

- bitfields

jo: 3 days

- unique index

jo: 3 days

- pseudo attributes

jo: 1 day

sum: 2 + 2 weeks

- blobs
1 week read, (preferably done during above)
1 week impl. (very uncertain)

sum: 2 + 2 + 2 weeks