Source code for data_migrator.models.fields

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import uuid
import json
import datetime
from dateutil import parser as p
from functools import partial

from data_migrator.exceptions import ValidationException, DataException
from data_migrator.exceptions import DefinitionException
from data_migrator.utils import isstr


def new_exception(field, exc_class, msg, *args):
    '''Build (but do not raise) an exception whose message names the field.

    The message is prefixed with ``<FieldClassName>[<field.name>]: `` and
    the whole string is then %-interpolated.

    Args:
        field: field instance; its class name and ``name`` attribute are
            used for the prefix.
        exc_class: exception class (or any callable) that receives the
            formatted message.
        msg (str): %-style format string for the detail part.
        *args: values interpolated into ``msg``.

    Returns:
        the result of ``exc_class(<formatted message>)``.
    '''
    template = "%s[%s]: " + msg
    values = (field.__class__.__name__, field.name) + args
    return exc_class(template % values)


def _replace(format_str, x):
    '''Fill ``format_str`` positionally with the items of ``x``.

    Args:
        format_str (str): ``str.format`` template using positional
            placeholders such as ``{0}``.
        x: iterable of values, unpacked as the positional arguments.

    Returns:
        the formatted string.
    '''
    filled = format_str.format(*x)
    return filled


class BaseField(object):
    '''Base column definition for the transformation DSL

    The following arguments are available to all field types. All are
    optional.

    Arguments:
        pos (int): If positive or zero this denotes the column in the source
            data to select and store in this field. If not set (or negative)
            the field is interpreted as not selecting just a column from the
            source but to take the full row in the parse function.
        name (str): The name of this field. By default this is the name
            provided in the model declaration. This attribute is to replace
            that name by the final column name.
        default: The default value to use if the source column is found to
            be a ``null`` field or if the parse function returns None. This
            attribute has default values for Fields that are not
            Null<xxx>Fields. For example NullStringField has both NULL and
            empty string as empty value. :class:`~.StringField` only has
            empty string as empty value. With this field it can be changed
            to some other standard value. Consider a Country field as string
            and setting it to the home country by default.
        key (boolean): If set, this indicates the field is a key field for
            identification of the object.
        nullable (str): If set it will match the source column value and
            consider this a ``None`` value. By default this attribute is set
            to ``"NULL"``. Note that for non-Null fields ``None`` will be
            translated to :attr:`~.default`.
        replacement: If set, this is a pre-emit replacement function. This
            could be used to insert dynamic replacement lookup select
            queries, adding more indirection into the data generation. Value
            could be either function or a string.
        required (boolean): If set, this indicates the field is required to
            be set.
        parse: If set this is the parsing function to replace the read value
            into something to use further down the data migration. Use this
            for example to clean phone numbers, translate country
            definitions into alpha3 codes, or to translate ID's into values
            based on a separately loaded lookup table.
        validate: Expects a function that returns a boolean, and used to
            validate the input data. Expecting data within a range or a
            specific format, add a column validator here. Raises
            :exc:`~.ValidationException` if set and false.
        max_length (int): In case of :class:`~.StringField` use this to trim
            string values to maximum length.
        unique (boolean): If ``True``, *data-migrator* will check uniqueness
            of intermediate values (after parsing). Default is ``False``.
            In relationship with the default manager this will keep track of
            values for this field. The manager can raise exceptions if
            uniqueness is violated. Note that it is up to the manager to
            either fail or drop the record if the exception is raised.
        anonymize: Add an additional function that will be called at emit
            to anonymize the data. A class may be passed instead of an
            instance; it is then instantiated without arguments.
        validate_output: A pre-emit validator used to scan the bare output
            and raise exceptions if output is not as expected.
        creation_order: An automatically generated attribute used to
            determine order of specification, and used in the emitting of
            dataset.

    Raises:
        :class:`~.DefinitionException`: if ``max_length`` is set on a field
            whose ``schema_type`` is not ``"string"``.
    '''
    # class-wide counter; every new instance takes the current value and
    # bumps it, so instances can be sorted in declaration order
    creation_order = 0
    schema_type = 'object'

    def __init__(self, pos=-1, name="",
                 default=None, nullable="NULL",
                 key=False, required=False,
                 replacement=None, parse=None, validate=None,
                 anonymize=None, max_length=None, unique=False,
                 validate_output=None):
        # default value if null; an explicit argument wins over the
        # class-level ``default`` declared by subclasses
        self.default = default if default is not None else getattr(self.__class__, 'default', default)
        # key indicated key field
        self.key = key
        # trimming only makes sense for string typed fields
        if max_length and self.schema_type != "string":
            raise DefinitionException("Cannot set max_length on non-string field")
        self.max_length = max_length if isinstance(max_length, int) else None
        # name of this field (will be set in Model class construction)
        self.name = name
        # input string that defines null -> None
        self.nullable = nullable
        # some function to apply to value
        self.parse = parse or getattr(self.__class__, 'parse', None)
        # fixed position in the row to read
        self.pos = int(pos)
        # replace string to use in output; a plain string is turned into a
        # callable that formats the emitted value(s) into it
        if isstr(replacement):
            replacement = partial(_replace, replacement)
        # a class-level ``replace`` attribute, when present, takes precedence
        self.replace = getattr(self.__class__, 'replace', replacement)
        # required indicates must be filled in
        self.required = required
        # unique indicates a unique field
        self.unique = unique
        # anonymize is the anonymization function
        self.anonymize = anonymize() if isinstance(anonymize, type) \
            else anonymize
        # some function to apply to value
        self.validate = validate or getattr(self.__class__, 'validate', None)
        # output validator
        self.validate_output = validate_output
        # creation_order is required for orderdict to retain order of fields
        self.creation_order = BaseField.creation_order
        BaseField.creation_order += 1

    def scan(self, row):
        '''scan row and harvest distinct value.

        Takes a row of data and parses the required fields out of this.

        Args:
            row (list): array of source data

        Returns:
            parsed and processed value. ``None`` is returned directly
            (without calling ``_value``) when the column equals
            :attr:`nullable`.

        Raises:
            :class:`~.DataException`: raised if the column at ``pos`` cannot
                be read from the row.
            :class:`~.ValidationException`: raised if explicit validation
                fails.
        '''
        # see if we want to read a column in the row
        v = None
        if self.pos >= 0:
            try:
                _v = row[self.pos]
            except (IndexError, KeyError, TypeError):
                # interpolate here: exception classes do not format
                # printf-style arguments themselves
                raise DataException(
                    'parsing %r, row len %d, index %d not found'
                    % (self.name, len(row), self.pos))
            # do null check if enabled
            if self.nullable is not None and _v == self.nullable:
                return v
            v = _v
            if self.validate and not self.validate(v):
                raise ValidationException(
                    'field %r input data did not validate' % self.name)
            # apply intermediate function on output, default is stripping
            if self.parse:
                v = self.parse(v)
        elif self.parse:
            # no column selected: the parse function receives the full row
            v = self.parse(row) or v
        # delegate to inner function, to reuse this logic
        return self._value(v)

    def emit(self, v, escaper=None):
        '''helper function to export this field.

        Expects a value from the model to be emitted

        Args:
            v: value to emit
            escaper: escaper function to apply on value; only used when no
                replacement is configured

        Returns:
            emitted value.

        Raises:
            :class:`~.ValidationException`: raised if explicit validation
                fails.
        '''
        if self.max_length and isstr(v):
            v = v[:self.max_length]
        if v is None:
            v = self.default if self.default is not None else v
        if self.validate_output and not self.validate_output(v):
            raise ValidationException(
                "not able to validate %s=%s" % (self.name, v))
        # anonymize this data
        if self.anonymize:
            v = self.anonymize(v)
        # check if we have a replacement string to take into account
        if self.replace:
            # the replacement callable always receives a tuple
            if not isinstance(v, tuple):
                v = (v,)
            v = self.replace(v)  # pylint: disable=not-callable
        elif escaper:
            # allow external function (e.g. SQL escape)
            v = escaper(v)
        return v

    def json_schema(self, name=None):
        '''generate json_schema representation of this field

        Args:
            name: name if not taken from this field

        Returns:
            python representation of json schema: a single-entry dict
            mapping the field name to its schema.
        '''
        t = self.schema_type
        # Null<xxx>Field classes are recognised by their class name
        if 'Null' in self.__class__.__name__:
            t = [t, "null"]
        t = {'type': t}
        if self.key:
            t['key'] = True
        if self.max_length and self.schema_type == "string":
            t['maxLength'] = self.max_length
        if hasattr(self, 'schema_format'):
            t['format'] = self.schema_format
        return {name or self.name: t}

    def _value(self, v):  # pylint: disable=R0201
        '''Type conversion hook for subclasses; the base returns ``v`` as is.'''
        return v
class HiddenField(BaseField):
    '''Non emitting Field for validation and checking.

    A field that accepts input but is meant not to be emitted. It is useful
    for uniqueness checks and similar; combine it with a row-level parse
    function to check the complete row.

    The class adds no behaviour of its own on top of :class:`BaseField`.
    NOTE(review): skipping it at emit time is presumably done by the code
    that iterates over the fields -- confirm against the model/emitter.
    '''
class IntField(BaseField):
    '''Basic integer field handler.

    The class-level ``default`` is ``0``.
    '''
    schema_type = 'integer'
    default = 0

    def _value(self, v):
        '''Convert string input with ``int()``; pass other values through.'''
        if isstr(v):
            return int(v)
        return v


# Second public name bound to the very same class.
IntegerField = IntField
class DateTimeField(BaseField):
    '''Basic datetime field handler.

    String input is parsed with ``dateutil.parser.parse``; on emit,
    ``datetime`` values are rendered with :attr:`format`.
    '''
    schema_type = 'string'
    schema_format = 'date-time'

    def __init__(self, f=None, **kwargs):
        '''
        Args:
            f: ``strftime`` format used when emitting the datetime.
                Falls back to a class-level ``format`` attribute when one
                is defined, else to ``%Y-%m-%dT%H:%M:%SZ`` (RFC 3339).
            **kwargs: forwarded unchanged to :class:`BaseField`.
        '''
        self.format = f or getattr(self.__class__, 'format', "%Y-%m-%dT%H:%M:%SZ")
        super(DateTimeField, self).__init__(**kwargs)

    def _value(self, v):
        '''Parse string input into a datetime.

        Returns:
            ``None`` for the empty string, the parsed value for any other
            string, and non-string input unchanged.

        Raises:
            :class:`~.DataException`: if the string cannot be parsed.
        '''
        if isstr(v):
            if v == "":
                return None
            try:
                v = p.parse(v)
            except (ValueError, OverflowError):
                # interpolate explicitly so the message is readable
                raise DataException(
                    "%s could not parse date %s" % (self.name, v))
        return v

    def emit(self, v, escaper=None):
        '''Format datetime values with :attr:`format`, then emit as usual.'''
        # isinstance() is already False for None, no separate check needed
        if isinstance(v, datetime.datetime):
            v = v.strftime(self.format)
        return super(DateTimeField, self).emit(v, escaper)
class UTCNowField(DateTimeField):
    '''UTCNow generating field.

    Ignores whatever was scanned and yields the current UTC time instead.
    '''

    def _value(self, v):
        '''Discard ``v`` and return ``datetime.datetime.utcnow()``.

        The result is a naive datetime (no ``tzinfo``).
        '''
        now = datetime.datetime.utcnow()
        return now
class NullIntField(BaseField):
    '''Null integer field handler.

    Accepts an integer column that may also be ``None``, which is not the
    same as 0 (zero). No class-level ``default`` is declared.
    '''
    schema_type = 'integer'

    def _value(self, v):
        '''Convert string input with ``int()``; pass other values through.'''
        if not isstr(v):
            return v
        return int(v)
class StringField(BaseField):
    '''String field handler.

    Accepts a string column; the class-level ``default`` is the empty
    string.
    '''
    schema_type = 'string'
    default = ""

    def _value(self, v):
        '''Strip surrounding whitespace from strings; pass others through.'''
        if not isstr(v):
            return v
        return v.strip()
class NullStringField(BaseField):
    '''Null String field handler.

    Accepts a string column that may also be ``None``, which is not the
    same as the empty string (""). No class-level ``default`` is declared.
    '''
    schema_type = 'string'

    def _value(self, v):
        '''Strip surrounding whitespace from strings; pass others through.'''
        if isstr(v):
            stripped = v.strip()
            return stripped
        return v
class BooleanField(BaseField):
    '''Boolean field handler.

    A bool that takes any cased permutation of true, yes, 1 and translates
    this into ``True``, or ``False`` otherwise. Values that already are
    ``bool`` are kept as they are.
    '''
    default = False
    schema_type = 'boolean'

    def _value(self, v):
        '''Translate the scanned value into a bool.

        Returns:
            ``v`` itself when it is already a ``bool``; otherwise ``True``
            when the lower-cased value starts with ``y``, ``t`` or ``1``.
            Anything without ``lower()`` (e.g. ``None``) and the empty
            string yield ``False``.
        '''
        # a parse function may already have produced a real bool; without
        # this guard True would fall into the AttributeError branch below
        if isinstance(v, bool):
            return v
        try:
            return v.lower()[0] in ('y', 't', '1')
        except (AttributeError, IndexError):
            return False
class DefaultField(BaseField):
    '''Field whose value is always ``self.default``, on scan and on emit.'''

    def emit(self, v, escaper=None):
        '''Ignore ``v`` and emit ``self.default`` through the base emit.'''
        base = super(DefaultField, self)
        return base.emit(self.default, escaper)

    def _value(self, v):
        '''Ignore the scanned value; the result is always ``self.default``.'''
        return self.default
class NullField(DefaultField):
    '''NULL returning field.

    Behaves like :class:`DefaultField`; it declares no default of its own
    and reports the JSON schema type ``null``.
    '''

    def json_schema(self, name=None):
        '''Return ``{<name>: {'type': 'null'}}``.

        Args:
            name: key to use instead of ``self.name``.
        '''
        key = name or self.name
        return {key: {'type': 'null'}}
class UUIDField(BaseField):
    '''UUID generating field.

    Every scanned value is replaced by a fresh ``str(uuid.uuid4())``.
    '''
    schema_type = 'string'

    def __init__(self, *args, **kwargs):
        '''Forward everything to :class:`BaseField`, forcing ``default=None``.'''
        kwargs.update(default=None)
        super(UUIDField, self).__init__(*args, **kwargs)

    def _value(self, v):
        '''Discard ``v`` and return a newly generated UUID4 as a string.'''
        generated = uuid.uuid4()
        return str(generated)
class ObjectField(BaseField):
    '''JSON object field.'''
    schema_type = 'object'
    # NOTE: one dict object stored on the class; every instance that does
    # not get an explicit default refers to this same object
    default = dict()


# Second public name bound to the very same class.
DictField = ObjectField
class ArrayField(BaseField):
    '''JSON array field.'''
    schema_type = 'array'
    # NOTE: one list object stored on the class; every instance that does
    # not get an explicit default refers to this same object
    default = list()


# Second public name bound to the very same class.
ListField = ArrayField
class JSONField(BaseField):
    '''Field that emits its value as a JSON encoded string.

    Great for maps and lists to be stored in a string like db field.
    '''

    def emit(self, v, escaper=None):
        '''Serialise the value with ``json.dumps`` and emit the result.

        A ``None`` value is first replaced by ``self.default`` when that is
        not ``None``; the encoded string then goes through the base emit.
        '''
        if v is None and self.default is not None:
            v = self.default
        encoded = json.dumps(v)
        return super(JSONField, self).emit(encoded, escaper)
class MappingField(BaseField):
    '''Map based field translator.

    Translates values at emit time by looking them up in a map. Great for
    identity column replacements. If needed the output can be encoded as
    ``json``, for example if the map returns lists.
    '''

    def __init__(self, data_map, as_json=False, strict=False, **kwargs):
        '''
        Args:
            data_map: The mapping used for translation. In non-strict mode
                an unmapped key yields ``self.default``, or the key itself
                when the default is ``None``.
            as_json: If ``True``, the mapped value is json encoded on emit.
                Default is ``False``.
            strict: If ``True``, the value must be found in the map.
                Default is ``False``.
            **kwargs: forwarded unchanged to :class:`BaseField`.
        '''
        super(MappingField, self).__init__(**kwargs)
        # in strict mode a truthy default maps onto itself; note that this
        # writes into the mapping object handed in by the caller
        if strict and self.default:
            data_map[self.default] = self.default
        self.strict = strict
        self.as_json = as_json
        self.data_map = data_map

    def emit(self, v, escaper=None):
        '''Translate ``v`` through the map, then emit via the base class.

        Raises:
            :class:`~.DataException`: in strict mode, when the value is not
                a key of the map.
        '''
        if v is None and self.default is not None:
            v = self.default
        if self.strict:
            try:
                v = self.data_map[v]
            except KeyError:
                raise DataException("%s - %s not in map" % (self.name, v))
        else:
            fallback = v if self.default is None else self.default
            v = self.data_map.get(v, fallback)
        if self.as_json:
            v = json.dumps(v)
        return super(MappingField, self).emit(v, escaper)
class ModelField(BaseField):
    '''Model relation for hierarchical structures.

    A field that takes another model to build hierarchical structures.
    '''

    def __init__(self, fields, strict=None, **kwargs):
        '''
        Args:
            fields: relationship to another model. ``json_schema`` accepts
                a list of fields, a dict of name -> field, or a single
                object offering ``json_schema`` and ``name``.
            strict (boolean): model is considered strict. ``None`` leaves
                ``additionalProperties`` out of the schema.
            **kwargs: forwarded unchanged to :class:`BaseField`.
        '''
        super(ModelField, self).__init__(**kwargs)
        self.strict = strict
        self.fields = fields

    def json_schema(self, name=None):
        '''generate json_schema representation of this field

        Args:
            name: name if not taken from this field

        Returns:
            single-entry dict mapping the name to the base schema, extended
            with ``properties`` and optionally ``additionalProperties``.
        '''
        name = name or self.name
        # forward the resolved name: the base keys its result by
        # ``name or self.name``, so omitting it breaks custom names
        _res = super(ModelField, self).json_schema(name)[name]
        _p = {}
        if isinstance(self.fields, list):
            for i in self.fields:
                _p.update(i.json_schema())
        elif isinstance(self.fields, dict):
            for k, v in self.fields.items():
                _p.update(v.json_schema(name=k))
        else:
            _p.update(self.fields.json_schema(name=self.fields.name))
        _res['properties'] = _p
        if self.strict is not None:
            _res['additionalProperties'] = not self.strict
        return {name: _res}

    def emit(self, v, escaper=None):
        '''Emit the nested value through the related model.

        ``None`` is replaced by ``self.default`` (when that is not
        ``None``); any other value is handed to ``self.fields.emit``. The
        anonymizer, if any, is applied to the result. Unlike the base
        class, no replacement, escaper or output validation is applied
        here.
        '''
        if v is None:
            v = self.default if self.default is not None else v
        else:
            # The previous code called ``emit`` on an undefined global
            # ``model`` and so always failed here.
            # NOTE(review): ``self.fields`` is the only related-model
            # reference on this instance; this assumes it offers
            # ``emit(value, escaper)``. A list or dict of fields does not
            # -- confirm the intended contract.
            v = self.fields.emit(v, escaper)
        # anonymize this data
        if self.anonymize:
            v = self.anonymize(v)
        return v