# Samizdat RDF storage
#
#   Copyright (c) 2002-2005  Dmitry Borodaenko <angdraug@debian.org>
#
#   This program is free software.
#   You can distribute/modify this program under the terms of
#   the GNU General Public License version 2 or later.
#
# see doc/rdf-storage.txt for introduction and Samizdat Squish definition;
# see doc/storage-impl.txt for explanation of implemented algorithms
#
# vim: et sw=2 sts=2 ts=8 tw=0

require 'samizdat/cache'

module Samizdat

# provides access to RDF storage via DBI-like interface
#
class RDF
  # set up storage access on top of a database handle
  #
  # +db+ is the DBI-like database handle used for all queries; +map+ is a
  # hash with a 'map' section (property -> mapping) and an 'ns' section
  # (namespace prefix -> uri). The leading "prefix::" of every property name
  # in the 'map' section is replaced with the matching 'ns' entry before the
  # property is stored in the internal property map.
  #
  def initialize(db, map)
    @db = db
    @select_cache = Cache.new(nil, 1000)   # parsed queries, keyed by query text

    @map = {}
    map['map'].each_pair do |property, mapping|
      expanded = property.sub(/\A(\S+?)::/) { map['ns'][$1] }
      @map[expanded] = mapping
    end
  end

  # internal property map with namespace prefixes expanded to full uris,
  # as built by #initialize
  attr_reader :map

  # reference to the DBI-like database handle passed to #initialize
  attr_reader :db

  # look up a single value of +property+ for +subject+
  #
  # builds a one-triple Squish SELECT from the two arguments, runs it through
  # #select_one and returns the first column of the answer (nil when
  # #select_one returns no row)
  #
  def get_property(subject, property)
    squish = "\nSELECT ?object WHERE (#{property} #{subject} ?object)"
    value, = select_one(squish)
    value
  end

  # get one query answer (similar to DBI#select_one)
  #
  # +query+ and +params+ are translated by #select; the resulting SQL and
  # parameter values are handed to the database handle's select_one
  #
  def select_one(query, params={})
    sql_and_values = select(query, params)
    @db.select_one(*sql_and_values)
  end

  # get all query answers (similar to DBI#select_all)
  #
  # +query+ and +params+ are translated by #select; when given, +limit+ and
  # +offset+ are appended to the SQL as LIMIT and OFFSET clauses. An optional
  # block is forwarded to the database handle's select_all.
  #
  def select_all(query, limit=nil, offset=nil, params={}, &p)
    translated, *bound = select(query, params)

    # assemble into a new string so the translated SQL is never modified
    pieces = [translated]
    pieces << "\nLIMIT #{limit}" if limit
    pieces << "\nOFFSET #{offset}" if offset

    # forwarding a nil block is the same as passing no block at all
    @db.select_all(pieces.join, *bound, &p)
  end

  # translate a Squish SELECT query to SQL
  #
  # accepts a String or a pre-parsed SquishQuery object; a String is parsed
  # into a SquishQuery once and the parsed object is kept in the select cache
  # under the query text. Returns whatever SquishQuery#to_sql returns for
  # +params+.
  #
  # raises ProgrammingError for any other kind of +query+
  #
  def select(query, params={})
    if query.kind_of?(String)
      query = @select_cache.fetch_or_add(query) { SquishQuery.new(query) }
    end
    unless query.kind_of?(SquishQuery)
      raise ProgrammingError, "String or SquishQuery expected"
    end
    query.to_sql(params)
  end

  # merge Squish query into RDF database
  #
  # +query+ is passed to SquishQuery.new and must parse to an assert
  # (INSERT and/or UPDATE) query; +params+ maps parameter names (symbols) to
  # the values of :name placeholders used in the query
  #
  # returns list of new ids assigned to blank nodes listed in INSERT section
  #
  # raises ProgrammingError if the query is not of assert type
  #
  # always run it inside transaction, with AutoCommit disabled
  #
  def assert(query, params={})
    q = SquishQuery.new(query)
    raise ProgrammingError, 'Wrong query type: assert expected' unless
      q.type == :assert
    insert, update = q.nodes

    # Stage 1: Resources
    v = {}   # v: node -> value
    new = {} # new[node] if node was inserted at this stage
    q.pm.each do |node, map|
      if node =~ SquishQuery::INTERNAL   # internal resource
        v[node] = $1   # resource id
      elsif node =~ SquishQuery::PARAMETER or node =~ SquishQuery::LITERAL
        v[node] = node   # pass parametrized value or string literal as is
      elsif node =~ SquishQuery::BN and not map.rassoc(0)
        # blank node occurring only in object position
        v[node] = update[node]   # todo: what if update[node].nil?
      elsif node =~ SquishQuery::BN   # resource blank node
        # look for an existing resource matching the subgraph rooted at node,
        # unless the query explicitly asks to insert a new one
        s = SquishQuery.new(
          :type => :select,
          :nodes => [node],
          :pattern => subgraph(node, q.pattern),
          :strings => q.strings
        )
        v[node], = select_one(s, params) unless insert.include?(node)
        if v[node].nil?
          db.do "INSERT INTO Resource (label) VALUES (?)",
            q.c[map.rassoc(0)[0]][0][0]   # subject table
          v[node], = db.select_one "SELECT MAX(id) FROM Resource"
          new[node] = true
        end
      else   # external resource
        v[node], = db.select_one "SELECT id FROM Resource
          WHERE literal = 'false' AND uriref = 'true' AND label = ?", node
        if v[node].nil?
          db.do "INSERT INTO Resource (uriref, label) VALUES (true, ?)", node
          v[node], = db.select_one "SELECT MAX(id) FROM Resource"
          new[node] = true
        end
      end
    end

    # Stage 2: Properties
    a = {}   # a: alias -> positions*
    q.c.each_index do |i|
      (a[q.c[i][2]] ||= []).push i
    end

    a.each do |alias_, map|
      key_node = q.pattern[map[0]][1]   # subject
      table = q.c[map[0]][0][0]   # subject table
      data = []
      map.each do |position|
        node = q.pattern[position][2]   # object
        if new[key_node] or update[node]
          # the value keeps its 'n' literal placeholders for now: string
          # literals are put back only after parameter substitution (below),
          # so that a ':word' inside a literal is not taken for a parameter
          data.push [
            q.c[position][1][1],   # object field
            v[node]   # value
          ]
          # todo: prove that value is not nil
        end
      end
      sql = nil
      if new[key_node]
        data.unshift [ 'id', v[key_node] ]
        # when id is inserted, insert_resource() trigger does nothing
        sql = "INSERT INTO #{table} ("+
          data.collect {|field, value| field }.join(', ')+') VALUES ('+
          data.collect {|field, value| value }.join(', ')+')'
      elsif data.length > 0
        # interpolation (rather than String#+) also accepts non-String values,
        # such as numeric ids returned by the database
        sql = "UPDATE #{table} SET "+
          data.collect {|field, value| "#{field} = #{value}" }.join(', ')+
          " WHERE id = #{v[key_node]}"
      end
      if sql
        # same order as SquishQuery#to_sql: parameters first, literals last
        sql, values = SquishQuery.substitute_parameters(sql, params)
        db.do(q.substitute_literals(sql), *values)
      end
    end

    return insert.collect {|node| v[node] }
  end

private

  # collect the triples of +pattern+ reachable from +node+
  #
  # starting with +node+, repeatedly picks up every triple whose subject
  # (element 1) is already reached and marks its object (element 2) as
  # reached, until a full pass over +pattern+ adds nothing. Returns the
  # collected triples in the order they were picked up, each at most once.
  #
  def subgraph(node, pattern)
    reached = [node]
    collected = []
    loop do
      grown = false
      pattern.each do |triple|
        next unless reached.include?(triple[1])
        next if collected.include?(triple)
        reached << triple[2]
        collected << triple
        grown = true
      end
      break unless grown
    end
    collected
  end
end


require 'uri/common'

# parse Squish query and translate triples to relational conditions
#
# provides access to internal representation of the parsed query and utility
# functions to deal with Squish syntax
#
class SquishQuery
  # regexp for internal resource reference (captures the numeric id)
  INTERNAL = /\A([[:digit:]]+)\z/.freeze

  # regexp for blank node mark and name (captures the name after '?')
  BN = /\A\?([[:alnum:]_]+)\z/.freeze

  # regexp for parametrized value (captures the name after ':')
  PARAMETER = /\A:([[:alnum:]_]+)\z/.freeze

  # regexp for replaced string literal
  LITERAL = /\A'\d+'\z/.freeze

  # regexp for number
  NUMBER = /\A-?[[:digit:]]+(\.[[:digit:]]+)?\z/.freeze

  # regexp for operator
  OPERATOR = /\A(\+|-|\*|\/|<|<=|>|>=|=|I?LIKE|NOT|AND|OR|IS|NULL)\z/i.freeze

  # regexp for aggregate function
  AGGREGATE = /\A(avg|count|max|min|sum)\z/i.freeze

  # regexp for a whole Squish query; captures, in order: starting keyword,
  # nodes section, WHERE pattern, LITERAL expression, GROUP BY expression,
  # ORDER BY expression, order direction and USING (namespaces) section
  QUERY = /\A\s*(SELECT|INSERT|UPDATE)\b\s*(.*?)\s*
        \bWHERE\b\s*(.*?)\s*
        (?:\bLITERAL\b\s*(.*?))?\s*
        (?:\bGROUP\s+BY\b\s*(.*?))?\s*
        (?:\bORDER\s+BY\b\s*(.*?)\s*(ASC|DESC)?)?\s*
        (?:\bUSING\b\s*(.*?))?\s*\z/mix.freeze

  # extract common Squish query sections, perform namespace substitution,
  # generate query pattern graph, call transform_pattern,
  # determine query type and parse nodes section accordingly
  #
  def initialize(query)
    query.nil? and raise ProgrammingError, "SquishQuery: query can't be nil"
    query = query.dup
    if query.kind_of? Hash  # pre-parsed query for RDF#assert
      @type = query[:type]
      @nodes = query[:nodes]
      @pattern = query[:pattern]
      @strings = query[:strings]
      @literal = @group = @order = ''
      transform_pattern
      return self
    elsif not query.kind_of? String
      raise ProgrammingError,
        "Bad query initialization parameter class: #{query.class}"
    end

    # replace string literals with 'n' placeholders (also see #substitute_literals)
    @strings = []
    query.gsub!(/'(''|[^'])*'/m) do
      @strings.push $&
      "'" + (@strings.size - 1).to_s + "'"
    end

    match = QUERY.match(query) or raise ProgrammingError,
      "Malformed query: are keywords SELECT, INSERT, UPDATE or WHERE missing?"
    match, @key, @nodes, @pattern, @literal, @group, @order, @order_dir,
      @ns = match.to_a.collect {|m| m.to_s }
    @key.upcase!
    @order_dir.upcase!

    # namespaces
    # todo: validate ns
    @ns = ('' == @ns or /\APRESET\s+NS\z/ =~ @ns) ? config['ns'] :
      Hash[*@ns.gsub(/\b(FOR|AS|AND)\b/i, '').scan(/\S+/)]
    @pattern = @pattern.scan(/\(.*?\)/).collect do |c|
      c.scan(/\((\S+)\s+(\S+)\s+(.*?)\)/).flatten.collect do |u|
        u.sub(/\A(\S+?)::/) do
          @ns[$1] or raise ProgrammingError,
          "Undefined namespace prefix #{$1}"
        end
      end
    end

    # validate SQL expressions
    validate_expression(@literal)
    @group.split(/\s*,\s*/).each {|group| validate_expression(group) }
    validate_expression(@order)

    transform_pattern

    query.scan(/\?\b.+?\b/) do |node|
      raise ProgrammingError, "Blank node '#{node}' is not bound by the query pattern" unless @b[node]
    end

    @literal.scan(/\?\b.+?\b/) do |node|
      @jc.delete(@b[node]+' IS NOT NULL')   # treat it as ground
    end

    # determine query type, parse and validate nodes section
    if 'SELECT' == @key
      @type = :select
      @nodes = @nodes.split(/\s*,\s*/)
      @nodes.each {|node| validate_expression(node) }
    else
      @type = :assert
      if 'UPDATE' == @key
        insert = ''
        update = @nodes
      elsif 'INSERT' == @key and @nodes =~ /\A\s*(.*?)\s*(?:\bUPDATE\b\s*(.*?))?\s*\z/
        insert, update = $1, $2.to_s
      else
        raise ProgrammingError,
          "Query doesn't start with one of SELECT, INSERT, or UPDATE"
      end
      insert = insert.split(/\s*,\s*/).each {|s|
        raise ProgrammingError, "Blank node expected in INSERT section instead of '#{s}'" unless s =~ BN
      }
      update = Hash[*update.split(/\s*,\s*/).collect {|s|
        s.split(/\s*=\s*/)
      }.each {|node, value|
        raise ProgrammingError, "Blank node expected on the left side of UPDATE assignment instead of '#{bn}'" unless node =~ BN
        validate_expression(value)
      }.flatten!] unless '' == update
      @nodes = [insert, (update or {})]
    end
    # todo: don't bind list

    return self
  end

  # string literals cut out of the query text; the 'n' placeholder left in
  # the query is the index of the literal in this list
  attr_reader :strings

  # starting keyword, SELECT, INSERT or UPDATE (upper case)
  attr_reader :key

  # type of query, :select or :assert
  attr_reader :type

  # blank variables control section: list of expressions for :select,
  # [insert list, update hash] for :assert
  attr_reader :nodes

  # query pattern graph as array of triples [ [p, s, o], ... ]
  attr_reader :pattern

  # literal SQL expression
  attr_reader :literal

  # SQL GROUP BY expression
  attr_reader :group

  # SQL order expression
  attr_reader :order

  # direction of order, ASC or DESC (empty string when not given)
  attr_reader :order_dir

  # query namespaces mapping
  attr_reader :ns

  # replace 'n' substitutions with query string literals (see #new, #LITERAL)
  #
  # a placeholder whose number has no entry in the list of replaced literals
  # is left as it is; anything that is not a String is returned untouched
  #
  def substitute_literals(s)
    return s unless s.kind_of?(String)

    s.gsub(/'(\d+)'/) do |placeholder|
      @strings[$1.to_i] || placeholder
    end
  end

  # replace RDF query parameters in SQL query with '?' marks,
  # return resultant query and array of parameter values
  #
  # +sql+ is not modified (so it may be frozen or shared); +params+ maps
  # parameter names (symbols) to values. Values are returned in the order
  # in which their :name marks appear in +sql+.
  #
  # raises ProgrammingError when +sql+ uses a parameter missing from +params+
  #
  def self.substitute_parameters(sql, params={})
    values = []
    translated = sql.gsub(/\B:([[:alnum:]_]+)/) do   # see #PARAMETER
      name = $1.to_sym
      params.has_key?(name) or raise ProgrammingError,
        "Missing value for :#{name} in parametrized query"
      values.push(params[name])
      '?'
    end
    [translated, values]
  end

  # replace schema uri with namespace prefix
  #
  # when +uriref+ consists of +uri+ followed by a local name (one or more
  # characters other than '/' and '#'), it is rewritten in place to
  # "prefix::name". Returns the modified +uriref+, or nil if it did not
  # match and was left untouched.
  #
  def self.uri_shrink!(uriref, prefix, uri)
    # the namespace uri is matched literally: characters like '.', '?' or '+'
    # must not act as regexp metacharacters
    uriref.gsub!(/\A#{Regexp.escape(uri)}([^\/#]+)\z/) { "#{prefix}::#{$1}" }
  end

  # replace schema uri with a prefix from a supplied namespaces hash
  #
  # works on a copy, +uriref+ itself is not modified; namespaces are tried in
  # the iteration order of the hash and the first one that matches is applied.
  # Returns the copy, shrunk or not.
  #
  def SquishQuery.ns_shrink(uriref, namespaces=config['ns'])
    shrunk = uriref.dup
    namespaces.each do |prefix, uri|
      break if SquishQuery.uri_shrink!(shrunk, prefix, uri)
    end
    shrunk
  end

  # replace schema uri with a prefix from query namespaces
  #
  # delegates to SquishQuery.ns_shrink with this query's namespaces mapping
  # (@ns) and returns its result
  #
  def ns_shrink(uriref)
    SquishQuery.ns_shrink(uriref, @ns)
  end

  # transform Squish query pattern graph to SQL join
  #
  # works on +pattern+, by default a shallow copy of @pattern, so triples
  # appended during reification do not end up in @pattern itself
  #
  # produces following core mappings:
  #
  #   c: position -> field, table, alias
  #      (c[i] = [[table, 'id'], [table, field], alias] for triple i,
  #      nil for a triple that was replaced by its reification)
  #   pm: node -> positions*
  #   jc: conditions*
  #
  # and utility mappings:
  #
  #   b: node -> alias.field
  #   table: alias -> table
  #
  # the results are stored in @c, @pm, @jc (sorted, without duplicates),
  # @b and @table
  #
  # raises ProgrammingError for an invalid triple or a node that could not
  # be bound to a table
  #
  def transform_pattern(pattern = @pattern.dup)
    # Stage 1: Predicate Mapping
    pm = {}   # positional map: pm[node]=>[[clause, position], ...]
    c = []    # c[i][0]=ci.s, c[i][1]=ci.o, c[i][2]=ci.a (storage-impl.txt)
    # NOTE(review): pattern grows inside this loop (see the reification branch
    # below); the appended triples are presumably visited by later iterations
    # of this same each_index -- confirm
    pattern.each_index do |i|
      p, s, o = pattern[i]
      # validate the triple
      raise ProgrammingError, "Valid uriref expected in predicate position instead of '#{p}'" unless p =~ URI::URI_REF
      [s, o].each {|node| raise ProgrammingError, "Resource or blank node name expected instead of '#{node}'" unless
        node =~ INTERNAL or node =~ URI::URI_REF or node =~ BN }
      # NOTE(review): this `rdf` precedes the assignment of the local variable
      # of the same name further down, so it looks like a call to an rdf
      # method defined outside this class -- confirm
      map = rdf.map[p]
      if map and (s =~ BN or s =~ INTERNAL or map['Resource'])
        # internal predicate and subject is mappable to Resource table
        c[i] = [   # list of possible mappings into internal tables
          map.keys.collect {|t| [t, 'id'] },   # subject
          map.keys.collect {|t| [t, map[t]] }  # object
        ]
        # reverse mapping of the node occurrences
        if pm[s] then pm[s].push [i, 0]
        else pm[s] = [[i, 0]] end
        if pm[o] then pm[o].push [i, 1]
        else pm[o] = [[i, 1]] end
      else
        # assume reification for unmapped predicates:
        #
        #            | (rdf::predicate ?_stmt_#{i} p)
        # (p s o) -> | (rdf::subject ?_stmt_#{i} s)
        #            | (rdf::object ?_stmt_#{i} o)
        #
        # c[i] stays nil for the reified triple itself
        rdf = config['ns']['rdf']
        stmt = "?_stmt_#{i}"
        pattern.push [rdf+'predicate', stmt, p],
                     [rdf+'subject', stmt, s],
                     [rdf+'object', stmt, o]
      end
    end

    # refine ambiguous properties: when the same node has a narrower list of
    # candidate mappings at another position, keep only the common candidates
    pm.each do |node, map|
      map.each do |i, j|   # i, j = clause, position
        if (big = c[i][j]).length > 1  # more than one mapping
          map.each do |k, l|   # optimize: don't look back
            if c[k][l].length < big.length and (refined = big & c[k][l]).length > 0
              c[i][j] = big = refined   # refine node...
              c[i][1-j].collect! {|x|
                refined.assoc(x[0])? x: nil
              }.compact!   # ...and its pair
              # c[i][1-j] is node's pair in the c[i] triple
            end
          end
        end
      end
    end

    # remove remaining ambiguous mappings (the first candidate is kept)
    # todo: split query for ambiguous mappings
    for m in c do
      next unless m   # means m is reified
      m[0] = m[0][0]
      m[1] = m[1][0]
    end

    # Stage 2: Relation Aliases and Join Conditions
    ac = 'a'    # alias counter
    jc = []     # join conditions
    b = {}      # bind nodes to fields {node => 'alias.field'}
    table = {}  # relation aliases {alias => table}

    # define relation aliases
    pm.each do |node, map|
      map.each do |i, j|   # i, j = clause, position
        next if j == 1 or c[i][2]   # only subject positions without an alias
        map.each do |k, l|   # optimize: don't look forward
          # same node, same table -> same alias
          if c[k][2] and c[k][0][0] == c[i][0][0]
            c[i][2] = c[k][2]
            break
          end
        end
        if c[i][2].nil?   # new alias
          c[i][2] = ac.dup
          table[ac] = c[i][0][0]
          ac.next!
        end
      end
    end   # optimize: unnecessary aliases are generated

    pm.each do |node, map|
      # node binding: the field of the node's first occurrence
      b[node] = c[map[0][0]][2]+'.'+c[map[0][0]][map[0][1]][1]

      ground = false

      # join conditions: every other occurrence must equal the binding
      map[1, map.length-1].each do |i, j|   # i, j = clause, position
        if (tmp = c[i][2]+'.'+c[i][j][1]) != b[node]
          jc.push b[node]+' = '+tmp
          ground = true
        end
      end

      # ground non-blank nodes
      if node !~ BN
        if node =~ INTERNAL   # internal resource id
          jc.push b[node]+' = '+$1
        elsif node =~ PARAMETER or node =~ LITERAL
          jc.push b[node]+' = '+node
          # optimize: elsif c[map[0][0]][0] == 'Resource'
        else   # external resource
          # NOTE(review): node is embedded in the SQL text as is; presumably
          # it cannot contain a single quote at this point -- verify
          table[ac] = 'Resource'
          jc.push b[node]+' = '+ac+'.id AND '+
            ac+".literal = 'false' AND "+
            ac+".uriref = 'true' AND "+
            ac+".label = '"+node+"'"
          ac.next!
        end
        ground = true
      end

      # ground dangling blank nodes to existential quantifier
      if not ground
        # optimize: check if b[node] can be NULL
        jc.push b[node]+' IS NOT NULL'
      end
    end   # end of Stage 2

    # check that all nodes were bound to table aliases
    pm.each_key do |node|
      raise ProgrammingError,
        "Blank node '#{node}' is not bound to a table" unless b[node]
    end

    @c, @pm, @jc, @b, @table = c, pm, jc.sort.uniq, b, table
  end   # transform_pattern

  # mappings produced by transform_pattern:
  # c (position -> field, table, alias), pm (node -> positions),
  # jc (join conditions), b (node -> alias.field), table (alias -> table)
  attr_reader :c, :pm, :jc, :b, :table

  # validate expression
  #
  # expression := value [ operator expression ]
  #
  # value := blank_node | literal_string | number | '(' expression ')'
  #
  # whitespace between tokens (except inside parentheses) is mandatory
  #
  # +string+ is cut into tokens on whitespace and parentheses; every token
  # has to be a blank node, parameter, replaced string literal, number,
  # operator or aggregate function name. An empty string is valid.
  #
  # raises ProgrammingError on the first token of any other kind; the return
  # value is not meaningful
  #
  def validate_expression(string)
    # todo: lexical analyser
    # scan for the tokens themselves instead of splitting on the separators:
    # split would produce an empty first token (and reject the expression)
    # whenever the expression starts with a parenthesis
    string.scan(/[^\s()]+/).each do |token|
      case token
      when BN, PARAMETER, LITERAL, NUMBER, OPERATOR, AGGREGATE
        # recognized token, nothing to do
      else
        raise ProgrammingError, "Bad token '#{token}' in expression"
      end
    end
  end

  # translate Squish SELECT query to SQL,
  # return SQL query and a list of parameter values in proper order
  #
  # the result is an array: the SQL text first, then one value from +params+
  # for each '?' mark in it
  #
  # raises ProgrammingError if this is not a select query, or if a '?' is
  # left in the SQL after blank nodes were replaced with their bindings
  #
  def to_sql(params={})
    unless @type == :select
      raise ProgrammingError, "Wrong query type: select expected"
    end

    # literal condition is and-ed to the join conditions
    conditions = @jc.dup
    conditions.push('(' + @literal + ')') unless '' == @literal

    relations = @table.collect {|table_alias, relation| relation + ' ' + table_alias }

    # now put it all together, one SQL clause per line
    clauses = [
      'SELECT ' + @nodes.join(', '),
      'FROM ' + relations.join(', ')
    ]
    clauses.push('WHERE ' + conditions.join("\n  AND ")) unless conditions.empty?
    clauses.push('GROUP BY ' + @group) unless @group == ''
    clauses.push('ORDER BY ' + @order + ' ' + @order_dir) unless @order == ''
    sql = clauses.join("\n")

    # replace blank node names with bindings
    sql = sql.gsub(/\?\b.+?\b/) {|node| @b[node] || node }
    if sql =~ /\?/
      raise ProgrammingError, "Unexpected '?' in translated query (probably, caused by unmapped blank node): #{sql.gsub(/\s+/, ' ')};"
    end

    # parameters are replaced before string literals are put back, so that
    # the text of a literal is never searched for parameters
    sql, values = SquishQuery.substitute_parameters(sql, params)
    [substitute_literals(sql), *values]
  end
end

end   # module Samizdat
