# -*- coding: utf-8 -*-

'''
    HTMLParserEx & cssSelector
    Copyright (C) 2015 Hamid_PaK

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''

import re
import copy
import traceback

from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree


class HTMLParserEx(HTMLParser):
  '''
    Lenient HTML parser that builds an ElementTree from sloppy markup.

    feed() first normalises the markup with regular expressions (script
    bodies are swapped for placeholders, void elements are self-closed,
    attributes are re-quoted) and then runs it through HTMLParser, whose
    callbacks drive an etree.TreeBuilder.  close() returns the root
    element of that tree, or False when no tree could be built.

    Known limitation: attributes written without "=value" (for example
    "disabled") are dropped when opening tags are rebuilt.
  '''

  def __init__(self):
    HTMLParser.__init__(self)

    # Kept for backward compatibility; placeholders are now located with
    # reScriptPlaceholder instead of stripping every non-digit.
    self.reInt = re.compile(r'\D')
    self.reScriptPlaceholder = re.compile(r'\[REPLACE_SCRIPT:(\d+)\]')
    self.reScriptTags = re.compile(r'(<script\b[^>]*>)(.*?)(<\/script>)', re.DOTALL | re.IGNORECASE)
    self.reSelfClosingTags = re.compile(r'((<(img|br|hr|meta|link|area|base|col|embed|input|keygen|param|source|track|wbr)\b.*?)/?>)', re.DOTALL | re.IGNORECASE)
    # "[^>]*" keeps the match inside ONE closing tag; a greedy ".*" with
    # DOTALL would run up to the last ">" of the whole document.
    self.reInvalidClosingTags = re.compile(r'</(\w+)\s+[^>]*>')

    self.reAllOpeningTags = re.compile(r'(<[A-Z][A-Z0-9]*)\b(.*?)(/?>)', re.DOTALL | re.IGNORECASE)

    # Groups: 1 name, 2 "=", then exactly one of
    #   3 double-quoted value, 4 single-quoted value,
    #   5 value after an unterminated quote, 6 bare value (ends at whitespace).
    self.reTagAttrs = re.compile(r'([_a-z]+[_a-z0-9-]*)(=)(?:"([^"]*)"|\'([^\']*)\'|["\']([^"\']*)|([^\s"\'>]*))', re.DOTALL | re.IGNORECASE)
    self.reQuoteFix = re.compile(r'="([^\'\<\>\=]+\'[^\"\<\>]+)"')

    # Per-document state, (re)initialised by feed().
    self.tb = None
    self.scripts = []
    self.scriptIndex = 0


  def handle_starttag(self, tag, attributes):
    '''Open an element in the tree builder; builder errors are ignored.'''
    try:
      self.tb.start(tag, dict(attributes))
    except Exception:
      # Best effort: malformed markup must not abort the parse.
      pass


  def handle_endtag(self, tag):
    '''Close an element in the tree builder; builder errors are ignored.'''
    try:
      self.tb.end(tag)
    except Exception:
      # Best effort: stray or mismatched end tags are skipped.
      pass


  def handle_data(self, data):
    '''
      Add text to the current element, putting the original script body
      back wherever feed() left a [REPLACE_SCRIPT:n] placeholder.
    '''
    if '[REPLACE_SCRIPT:' in data:
      data = self.reScriptPlaceholder.sub( self._restoreScript, data )
    self.tb.data( data )


  def _restoreScript(self, match):
    '''Return the script body for a placeholder, or the placeholder itself if its index is unknown.'''
    idx = int( match.group(1) )
    if 0 <= idx < len( self.scripts ):
      return self.scripts[ idx ][1]
    return match.group(0)


  def close(self):
    '''
      Finish parsing and return the root element of the tree.

      Returns False when the tree cannot be produced (for example when
      feed() has not been called).
    '''
    try:
      HTMLParser.close(self)
      return self.tb.close()
    except Exception:
      return False


  def feed(self, text):
    '''
      Normalise *text* and parse it as one complete document.

      Every call starts a new tree.  Returns the normalised markup that
      was handed to HTMLParser, or None if an error occurred (the
      traceback is printed).
    '''
    try:
      # Each call is a fresh document, so drop any leftover parser state.
      self.reset()
      self.tb = etree.TreeBuilder()

      self.scripts = self.reScriptTags.findall( text )
      self.scriptIndex = 0

      # Script bodies may contain "<" and quotes that would confuse the
      # clean-up regexes below; hide them behind placeholders.
      data = self.reScriptTags.sub( self.getScriptReplacement, text )
      # Apostrophes inside double-quoted values become entities, only at
      # the matched attribute (not everywhere the same text occurs).
      data = self.reQuoteFix.sub( self._escapeApostrophes, data )
      data = self.reSelfClosingTags.sub( r'\2/>', data )
      data = self.reInvalidClosingTags.sub( r'</\1>', data )
      data = self.reAllOpeningTags.sub( self.getCleanOpeningTag, data )

      if '<html' not in data.lower():
        data = '<html>%s</html>' % data

      HTMLParser.feed( self, data )

      return data
    except Exception:
      traceback.print_exc()
    return None


  def _escapeApostrophes(self, match):
    '''Rebuild a ="..." attribute value with every apostrophe written as &apos;.'''
    return '="' + match.group(1).replace("'", '&apos;') + '"'


  def getScriptReplacement(self, match):
    '''
      Return the script element with its body replaced by a placeholder.

      Placeholders are numbered from 0 in document order so that the
      number is a valid index into self.scripts.
    '''
    index = self.scriptIndex
    self.scriptIndex += 1
    return '{scriptOpening}[REPLACE_SCRIPT:{scriptIndex}]{scriptClosing}'.format(scriptOpening=match.group(1), scriptIndex=index, scriptClosing=match.group(3))


  def getCleanOpeningTag(self, match):
    '''
      Rebuild an opening tag with every name=value attribute double-quoted.

      *match* comes from reAllOpeningTags: group 1 is "<tagname", group 2
      the raw attribute text and group 3 the closing ">" or "/>".
    '''
    parts = [ match.group(1) ]
    for attr in self.reTagAttrs.findall( match.group(2) ):
      # Only one of the value groups takes part in a match; the rest are ''.
      value = ''.join( attr[2:] ).replace('"', '&quot;')
      parts.append( ' ' + attr[0] + attr[1] + '"' + value + '"' )
    parts.append( match.group(3) )
    return ''.join( parts )



class cssSelector(object):
  '''
    Small CSS-like selector engine for ElementTree elements.

    Supported syntax: tag names ("*" matches any tag), #id, .class and
    [attr], [attr=v], [attr*=v], [attr^=v], [attr|=v], [attr$=v],
    [attr~=v].  All comparisons are case-insensitive.

    Behaviour worth knowing:
      * ".a.b" matches elements carrying ANY of the listed classes.
      * Space separated parts are not real descendant combinators: all
        parts are tested against the same element and a later part
        overrides an earlier condition of the same kind.
      * ":pseudo" parts are parsed but ignored when matching.
  '''

  def __init__(self, element=None, selector='*'):
    '''
      element  -- root Element to search, or a list of Elements that are
                  appended to a new <html> root.  Anything else yields an
                  empty <html> root.
      selector -- default selector used by find().
    '''
    try:
      # NOTE: marks the caller's element in place, with a non-string value.
      element.set('_doc', True)
      self.root = element
    except (AttributeError, TypeError):
      self.root = etree.Element('html')
      if isinstance(element, list):
        for x in element:
          self.root.append(x)

    self.selector = selector
    self.selected = []
    self.reAttrs = re.compile(r'^(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)(\*?\|?\^?\$?~?=)?(.+)?$')


  def parseSelector(self, selector):
    '''
      Split a selector string into a list of dicts, one per space
      separated part.  Possible keys are 'tag', 'id', 'class' (several
      classes joined by ','), 'attr' (several conditions joined by
      '|||') and 'pseudo'.
    '''
    result = []
    select = {}

    idx = 'tag'

    for letter in selector:
      # Inside [...] every character except the brackets is literal.
      literal = idx == 'attr' and letter not in ('[', ']')

      if not literal:
        if letter == '#':
          idx = 'id'
          continue

        if letter == '.':
          idx = 'class'
          if idx in select:
            select[ idx ] += ','
          continue

        if letter == '[':
          idx = 'attr'
          if idx in select:
            select[ idx ] += '|||'
          continue

        if letter == ']':
          idx = 'tag'
          continue

        if letter == ':':
          idx = 'pseudo'
          continue

        if letter == ' ':
          idx = 'tag'
          result.append( select )
          select = {}
          continue

      select[ idx ] = select.get( idx, '' ) + letter

    if select:
      result.append( select )

    return result


  def domSelector(self, element, select):
    '''
      Return *element* and all of its descendants that satisfy the parsed
      selector *select*, in document order.
    '''
    result = []

    # Explicit stack instead of recursion: no recursion limit on deep trees.
    stack = [ element ]
    while stack:
      node = stack.pop()
      if self._matches( node, select ):
        result.append( node )
      # Reversed so that the first child is popped (visited) first.
      stack.extend( reversed( list( node ) ) )

    return result


  def _matches(self, element, select):
    '''Tell whether a single element satisfies the parsed selector.'''
    tagCondition = True
    idCondition = True
    classCondition = True
    attrCondition = True

    for item in select:
      if 'tag' in item:
        wanted = item[ 'tag' ].lower()
        tagCondition = wanted == '*' or element.tag.lower() == wanted

      if 'id' in item:
        idCondition = element.get('id', '').lower() == item[ 'id' ].lower()

      if 'class' in item:
        wantedClasses = set( item[ 'class' ].lower().split(',') )
        elementClasses = set( element.get('class', '').lower().split(' ') )
        classCondition = bool( wantedClasses & elementClasses )

      if 'attr' in item:
        verdicts = []
        for attr in item[ 'attr' ].split('|||'):
          parsed = self.reAttrs.match( attr )
          if not parsed:
            continue
          verdict = self._attrMatches( element, *parsed.groups() )
          if verdict is not None:
            verdicts.append( verdict )

        # Every [..] condition of this part has to hold.
        if verdicts:
          attrCondition = all( verdicts )

    return tagCondition and idCondition and classCondition and attrCondition


  def _attrMatches(self, element, name, operator, value):
    '''Evaluate one [name<op>value] condition; None means the operator is not supported.'''
    name = name.lower()

    if operator is None or value is None:
      return name in element.attrib

    actual = element.get( name, '' ).lower()
    expected = value.lower().strip('\'"')

    if operator == '*=':
      return expected in actual
    if operator in ('^=', '|='):
      return actual.startswith( expected )
    if operator == '$=':
      return actual.endswith( expected )
    if operator == '~=':
      return expected in actual.split()
    if operator == '=':
      return actual == expected
    return None


  def find(self, selector=None, elements=None):
    '''
      Select elements and remember them in self.selected.

      selector -- selector string; defaults to the one given at
                  construction time.
      elements -- Element or list of Elements to search (each one and all
                  of its descendants).  When omitted, the previous
                  selection is searched, or the root if there is none.

      Returns True when at least one element matched.  When nothing
      matches, the previous selection is left untouched.
    '''
    if selector is None:
      selector = self.selector

    select = self.parseSelector( selector )

    # An Element without children is falsy, so it must not be tested with
    # "not elements"; only None and empty containers mean "not given".
    if elements is None or (not hasattr(elements, 'tag') and not elements):
      if self.selected:
        elements = self.selected
      else:
        elements = [self.root]
    elif not isinstance(elements, (list, tuple)):
      elements = [elements]

    result = []
    for el in elements:
      result += self.domSelector( el, select )

    if result:
      self.selected = result

    return bool( result )


  def copy(self):
    '''Return a shallow copy of this selector (root and selection are shared).'''
    return copy.copy(self)

