Grammar extracted by Vadim Zaytsev, see the Grammar Zoo entry for details: html/cordy/extracted
Source used for this grammar: James R. Cordy, SomeDeveloper, TXL 10.5 HTML Grammar, September 2007
18 nonterminals defined (program, element, tag, tag_elements, singleton_tag, singleton_tag_end, singleton_id, comment_tag, comment_text, tag_beg, tag_end, attributes, attribute, attribute_id, equals_attribute_value, attribute_value, text, text_unit), 1 root (program), 0 top (—), 14 bottom (NL ×8, SPON, punctuation ×2, fileref, url, stringlit, x_id ×2, SP ×5, id ×7, token ×2, EX, SPOFF, number ×2, IN).
program ::=
element
element ::= singleton_tag tag text comment_tag tag_beg tag_end
tag ::= "<" id attributes ">" NL IN tag_elements EX "</" id ">" NL
tag_elements ::=
element
singleton_tag ::= "<" singleton_id attributes ">" singleton_tag_end NL "<" id attributes "/>" NL
singleton_tag_end ::= "</" singleton_id ">"
singleton_id ::= "br" "hr" "|" "img" "meta" "base" "basefont" x_id "dt"
comment_tag ::= "<!" comment_text ">" NL
comment_text ::= punctuation SP token
tag_beg ::= "<" id attributes ">" NL
tag_end ::= "</" id ">" NL
attributes ::=
SPOFF attribute
SPON
attribute ::=
SP attribute_id equals_attribute_value
attribute_id ::= id x_id
equals_attribute_value ::=
"=" attribute_value
attribute_value ::= stringlit number id url fileref
text ::=
text_unit
NL
text_unit ::= punctuation SP ")" SP SP "(" token "<" number