parser.lua (6026B)
-- parser.lua - Simple HTML Parser
--
-- A small, forgiving recursive-descent parser that turns an HTML string into
-- a tree of `dom.Element` nodes. Text is represented by elements whose tag is
-- "text". Malformed markup never raises; the parser skips what it cannot
-- understand and keeps going.

-- NOTE(review): stock Lua's `require` takes a single argument; the second
-- argument is presumably understood by the host environment's loader - confirm.
local dom = require("dom", true) -- Use bytecode if available
local Element = dom.Element

local Parser = {}
Parser.__index = Parser

-- Elements that never have children or a closing tag.
-- Built once here instead of on every parseElement() call.
local VOID_ELEMENTS = {
    area = true, base = true, br = true, col = true, embed = true,
    hr = true, img = true, input = true, link = true, meta = true,
    param = true, source = true, track = true, wbr = true,
}

-- Closing-tag patterns for elements whose content is raw text (not markup).
-- Tag names are lowercased when parsed, but the closing tag in the source may
-- use any letter case, so the patterns match case-insensitively.
local RAW_TEXT_CLOSE = {
    script = "</[sS][cC][rR][iI][pP][tT]%s*>",
    style = "</[sS][tT][yY][lL][eE]%s*>",
}

-- Wrap a piece of (already whitespace-collapsed) text in a "text" element.
local function newTextNode(text)
    local node = Element.new("text", {})
    node:setText(text)
    return node
end

--- Create a parser with an empty input buffer.
-- @return a new Parser instance
function Parser.new()
    local self = setmetatable({}, Parser)
    self.html = ""
    self.pos = 1
    return self
end

--- Look at upcoming input without consuming it.
-- @param len number of characters to return (default 1)
-- @return the next `len` characters; shorter (possibly "") near end of input
function Parser:peek(len)
    len = len or 1
    return self.html:sub(self.pos, self.pos + len - 1)
end

--- Move the cursor forward.
-- @param len number of characters to skip (default 1)
function Parser:advance(len)
    self.pos = self.pos + (len or 1)
end

--- Advance the cursor past any run of spaces, tabs, CR and LF.
function Parser:skipWhitespace()
    if self.pos > #self.html then
        return
    end
    -- An anchored "*" always matches (possibly the empty string), so `last`
    -- is never nil here; for an empty match it equals pos - 1.
    local _, last = self.html:find("^[ \t\n\r]*", self.pos)
    self.pos = last + 1
end

--- Consume a tag name (letters, digits, hyphens) at the cursor.
-- @return the lowercased name, or nil (cursor unchanged) if none is present
function Parser:parseTagName()
    local tag = self.html:match("^([%w%-]+)", self.pos)
    if tag then
        self.pos = self.pos + #tag
        return tag:lower()
    end
    return nil
end

--- Parse the attribute list of an opening tag.
-- Stops in front of ">" or "/>" (neither is consumed), or at the first
-- character that cannot start an attribute name.
-- @return table mapping lowercased attribute names to their string value,
--         or to `true` for attributes written without "="
function Parser:parseAttributes()
    local attrs = {}
    local html = self.html

    while self.pos <= #html do
        self:skipWhitespace()

        if self:peek(1) == ">" or self:peek(2) == "/>" then
            break
        end

        -- Attribute name. Besides alphanumerics and "-", accept "_", ":" and
        -- "." so names such as xml:lang or data_id do not abort the list.
        local name = html:match("^([%w_:.%-]+)", self.pos)
        if not name then break end
        self.pos = self.pos + #name
        -- HTML attribute names are case-insensitive; normalise them the same
        -- way tag names are normalised.
        name = name:lower()

        self:skipWhitespace()

        if self:peek(1) == "=" then
            self:advance(1)
            self:skipWhitespace()

            local quote = self:peek(1)
            if quote == '"' or quote == "'" then
                self:advance(1)
                local valueEnd = html:find(quote, self.pos, true)
                if valueEnd then
                    attrs[name] = html:sub(self.pos, valueEnd - 1)
                    self.pos = valueEnd + 1
                else
                    -- Unterminated quote: keep the attribute and take
                    -- everything up to the end of the tag (or of the input)
                    -- instead of re-reading the value as attribute names.
                    local stop = html:find(">", self.pos, true) or (#html + 1)
                    attrs[name] = html:sub(self.pos, stop - 1)
                    self.pos = stop
                end
            else
                -- Unquoted value; `name=` with nothing after it yields "".
                local value = html:match("^([^%s>]+)", self.pos) or ""
                attrs[name] = value
                self.pos = self.pos + #value
            end
        else
            -- Boolean attribute (no "=").
            attrs[name] = true
        end
    end

    return attrs
end

--- Consume text up to (not including) the next "<".
-- @return the text with every whitespace run collapsed to a single space,
--         or nil if the cursor is at "<" or at end of input
function Parser:parseText()
    local text = self.html:match("^([^<]+)", self.pos)
    if text then
        self.pos = self.pos + #text
        -- Collapse whitespace; the assignment keeps only gsub's first result.
        text = text:gsub("%s+", " ")
        return text
    end
    return nil
end

--- Parse one element (and, recursively, its children) at the cursor.
-- When the cursor (after whitespace) is at "<", at least that character is
-- consumed even if nil is returned, so callers always make progress.
-- @return the parsed Element, or nil when the cursor is not at an opening
--         tag: no "<", a closing tag, a comment or <!...> declaration (both
--         are skipped), or a malformed/unterminated tag
function Parser:parseElement()
    self:skipWhitespace()

    if self:peek(1) ~= "<" then
        return nil
    end

    self:advance(1)

    -- Closing tag: left for the caller to handle.
    if self:peek(1) == "/" then
        return nil
    end

    -- Comment: skip it; an unterminated comment swallows the rest of input.
    if self:peek(3) == "!--" then
        local commentEnd = self.html:find("-->", self.pos, true)
        self.pos = commentEnd and (commentEnd + 3) or (#self.html + 1)
        return nil
    end

    -- DOCTYPE and other <!...> declarations: skip to the closing ">".
    if self:peek(1) == "!" then
        local declEnd = self.html:find(">", self.pos, true)
        self.pos = declEnd and (declEnd + 1) or (#self.html + 1)
        return nil
    end

    local tag = self:parseTagName()
    if not tag then return nil end

    local attrs = self:parseAttributes()

    -- Find end of opening tag.
    local gtPos = self.html:find(">", self.pos, true)
    if not gtPos then return nil end
    self.pos = gtPos + 1

    local elem = Element.new(tag, attrs)

    if VOID_ELEMENTS[tag] then
        return elem
    end

    -- Script and style: the content is raw text, not markup.
    local closePattern = RAW_TEXT_CLOSE[tag]
    if closePattern then
        local closeStart, closeEnd = self.html:find(closePattern, self.pos)
        if closeStart then
            elem:setText(self.html:sub(self.pos, closeStart - 1))
            self.pos = closeEnd + 1
        else
            -- No closing tag: the rest of the input belongs to this element,
            -- so script/style source is never re-parsed as markup.
            elem:setText(self.html:sub(self.pos))
            self.pos = #self.html + 1
        end
        return elem
    end

    -- Parse children.
    while self.pos <= #self.html do
        self:skipWhitespace()

        -- Check for closing tag.
        if self:peek(2) == "</" then
            local savePos = self.pos
            self:advance(2)
            local closeTag = self:parseTagName()
            if closeTag == tag then
                local closeGt = self.html:find(">", self.pos, true)
                if closeGt then
                    self.pos = closeGt + 1
                end
            else
                -- Someone else's closing tag: this element is implicitly
                -- closed; rewind so the caller sees the tag.
                self.pos = savePos
            end
            break
        end

        if self:peek(1) == "<" then
            local child = self:parseElement()
            if child then
                elem:addChild(child)
            end
        else
            local text = self:parseText()
            if text and text:match("%S") then
                elem:addChild(newTextNode(text))
            end
        end
    end

    return elem
end

--- Parse an HTML document or fragment.
-- If the input contains a <body> element (any letter case), only its content
-- is parsed; otherwise the whole input is. Elements and non-blank text found
-- at the top level become children of the returned root.
-- @param html the HTML source string
-- @return a synthetic "body" Element holding the parsed nodes
function Parser:parse(html)
    if type(html) ~= "string" then
        error("Parser:parse expects an HTML string, got " .. type(html), 2)
    end

    self.html = html
    self.pos = 1

    local root = Element.new("body", {})

    -- Locate the body case-insensitively. lower() keeps byte offsets intact,
    -- so positions found in `lowered` are valid in `html`.
    local lowered = html:lower()
    local bodyStart = lowered:find("<body[%s>/]")

    if bodyStart then
        local bodyOpenEnd = html:find(">", bodyStart, true)
        if bodyOpenEnd then
            self.pos = bodyOpenEnd + 1
        end
    end

    -- Search for the end only after the opening tag; without </body> parse
    -- through the very last character.
    local bodyEnd = lowered:find("</body%s*>", self.pos)
    local endPos = bodyEnd or (#html + 1)

    while self.pos < endPos do
        self:skipWhitespace()
        if self.pos >= endPos then break end

        if self:peek(2) == "</" then
            -- Stray closing tag with no open element: skip it as a unit.
            local gtPos = html:find(">", self.pos, true)
            self.pos = gtPos and (gtPos + 1) or endPos
        elseif self:peek(1) == "<" then
            -- parseElement always consumes at least the "<". When it returns
            -- nil (comment, DOCTYPE, malformed tag) the cursor is already
            -- past what it skipped, so it must NOT be advanced again here -
            -- doing so would eat the "<" of a tag that directly follows.
            local elem = self:parseElement()
            if elem then
                root:addChild(elem)
            end
        else
            local text = self:parseText()
            if not text then
                -- Defensive: guarantee forward progress.
                self:advance(1)
            elseif text:match("%S") then
                root:addChild(newTextNode(text))
            end
        end
    end

    return root
end

return {
    Parser = Parser
}