luajitos

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

parser.lua (6026B)


      1 -- parser.lua - Simple HTML Parser
      2 
      3 local dom = require("dom", true)  -- Use bytecode if available
      4 local Element = dom.Element
      5 
      6 local Parser = {}
      7 Parser.__index = Parser
      8 
      9 function Parser.new()
     10     local self = setmetatable({}, Parser)
     11     self.html = ""
     12     self.pos = 1
     13     return self
     14 end
     15 
     16 function Parser:peek(len)
     17     len = len or 1
     18     return self.html:sub(self.pos, self.pos + len - 1)
     19 end
     20 
     21 function Parser:advance(len)
     22     self.pos = self.pos + (len or 1)
     23 end
     24 
     25 function Parser:skipWhitespace()
     26     while self.pos <= #self.html do
     27         local c = self.html:sub(self.pos, self.pos)
     28         if c == " " or c == "\t" or c == "\n" or c == "\r" then
     29             self.pos = self.pos + 1
     30         else
     31             break
     32         end
     33     end
     34 end
     35 
     36 function Parser:parseTagName()
     37     local tag = self.html:match("^([%w%-]+)", self.pos)
     38     if tag then
     39         self.pos = self.pos + #tag
     40         return tag:lower()
     41     end
     42     return nil
     43 end
     44 
     45 function Parser:parseAttributes()
     46     local attrs = {}
     47 
     48     while self.pos <= #self.html do
     49         self:skipWhitespace()
     50 
     51         local c = self:peek(1)
     52         if c == ">" or self:peek(2) == "/>" then
     53             break
     54         end
     55 
     56         -- Parse attribute name
     57         local name = self.html:match("^([%w%-]+)", self.pos)
     58         if not name then break end
     59         self.pos = self.pos + #name
     60 
     61         self:skipWhitespace()
     62 
     63         -- Check for '='
     64         if self:peek(1) == "=" then
     65             self:advance(1)
     66             self:skipWhitespace()
     67 
     68             -- Parse value
     69             local quote = self:peek(1)
     70             if quote == '"' or quote == "'" then
     71                 self:advance(1)
     72                 local valueEnd = self.html:find(quote, self.pos, true)
     73                 if valueEnd then
     74                     attrs[name] = self.html:sub(self.pos, valueEnd - 1)
     75                     self.pos = valueEnd + 1
     76                 end
     77             else
     78                 -- Unquoted value
     79                 local value = self.html:match("^([^%s>]+)", self.pos)
     80                 if value then
     81                     attrs[name] = value
     82                     self.pos = self.pos + #value
     83                 end
     84             end
     85         else
     86             attrs[name] = true
     87         end
     88     end
     89 
     90     return attrs
     91 end
     92 
     93 function Parser:parseText()
     94     local text = self.html:match("^([^<]+)", self.pos)
     95     if text then
     96         self.pos = self.pos + #text
     97         -- Collapse whitespace
     98         text = text:gsub("%s+", " ")
     99         return text
    100     end
    101     return nil
    102 end
    103 
    104 function Parser:parseElement()
    105     self:skipWhitespace()
    106 
    107     if self:peek(1) ~= "<" then
    108         return nil
    109     end
    110 
    111     self:advance(1)
    112 
    113     -- Check for closing tag
    114     if self:peek(1) == "/" then
    115         return nil
    116     end
    117 
    118     -- Check for comment
    119     if self:peek(3) == "!--" then
    120         local commentEnd = self.html:find("-->", self.pos, true)
    121         if commentEnd then
    122             self.pos = commentEnd + 3
    123         end
    124         return nil
    125     end
    126 
    127     -- Check for DOCTYPE
    128     if self:peek(1) == "!" then
    129         local gtPos = self.html:find(">", self.pos, true)
    130         if gtPos then
    131             self.pos = gtPos + 1
    132         end
    133         return nil
    134     end
    135 
    136     local tag = self:parseTagName()
    137     if not tag then return nil end
    138 
    139     local attrs = self:parseAttributes()
    140 
    141     -- Find end of opening tag
    142     local gtPos = self.html:find(">", self.pos, true)
    143     if not gtPos then return nil end
    144     self.pos = gtPos + 1
    145 
    146     local elem = Element.new(tag, attrs)
    147 
    148     -- Self-closing tags
    149     local selfClosing = {
    150         br = true, hr = true, img = true, input = true,
    151         meta = true, link = true, area = true, base = true,
    152         col = true, embed = true, source = true, track = true,
    153         wbr = true
    154     }
    155 
    156     if selfClosing[tag] then
    157         return elem
    158     end
    159 
    160     -- Script and style: skip content
    161     if tag == "script" or tag == "style" then
    162         local closePattern = "</" .. tag .. ">"
    163         local closePos = self.html:find(closePattern, self.pos, true)
    164         if closePos then
    165             local content = self.html:sub(self.pos, closePos - 1)
    166             elem:setText(content)
    167             self.pos = closePos + #closePattern
    168         end
    169         return elem
    170     end
    171 
    172     -- Parse children
    173     while self.pos <= #self.html do
    174         self:skipWhitespace()
    175 
    176         -- Check for closing tag
    177         if self:peek(2) == "</" then
    178             local savePos = self.pos
    179             self:advance(2)
    180             local closeTag = self:parseTagName()
    181             if closeTag == tag then
    182                 local closeGt = self.html:find(">", self.pos, true)
    183                 if closeGt then
    184                     self.pos = closeGt + 1
    185                 end
    186                 break
    187             else
    188                 self.pos = savePos
    189                 break
    190             end
    191         end
    192 
    193         -- Try to parse child element
    194         if self:peek(1) == "<" then
    195             local child = self:parseElement()
    196             if child then
    197                 elem:addChild(child)
    198             end
    199         else
    200             local text = self:parseText()
    201             if text and text:match("%S") then
    202                 local textNode = Element.new("text", {})
    203                 textNode:setText(text)
    204                 elem:addChild(textNode)
    205             end
    206         end
    207     end
    208 
    209     return elem
    210 end
    211 
    212 function Parser:parse(html)
    213     self.html = html
    214     self.pos = 1
    215 
    216     local root = Element.new("body", {})
    217 
    218     -- Try to find <body>
    219     local bodyStart = html:find("<body")
    220     local bodyEnd = html:find("</body>")
    221 
    222     if bodyStart then
    223         local bodyOpenEnd = html:find(">", bodyStart, true)
    224         if bodyOpenEnd then
    225             self.pos = bodyOpenEnd + 1
    226         end
    227     end
    228 
    229     local endPos = bodyEnd or #html
    230 
    231     while self.pos < endPos do
    232         self:skipWhitespace()
    233         if self.pos >= endPos then break end
    234 
    235         local elem = self:parseElement()
    236         if elem then
    237             root:addChild(elem)
    238         else
    239             self:advance(1)
    240         end
    241     end
    242 
    243     return root
    244 end
    245 
    246 return {
    247     Parser = Parser
    248 }