Uz Galaxy - Info Archive top BBS   RSS
Info Archive  Network - オリジナル HTML Parser (HTMLファイル解析)  

HTMLファイルの構造解析に使用するため、独自の HTML Parser を作成しました。
(スクリプトが絡む部分の処理が怪しいし、コードがごちゃごちゃして汚いけど。)

使用例:
    Private Sub Form2_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
        Dim sFileName As String = "C:\Projects\InfoArchive\bin\Template\Info2.html"
        Dim html As New HtmlParser
        If Not html.Load(sFileName) Then
            Return
        End If

        Dim I As Integer
        For I = 1 To html.Nodes.Count
            AddTree(html.Nodes(I))
        Next
        tvwHtml.ExpandAll()
        tvwHtml.SelectedNode = tvwHtml.Nodes(0)
        tvwHtml.SelectedNode.EnsureVisible()
    End Sub

    Public Sub AddTree(ByVal html As HtmlNode, Optional ByVal oNode As TreeNode = Nothing)
        If html.TagType = HtmlNode.HtmlTagType.NoneTag Then
            If html.Html = vbCrLf Then
                Return
            End If
        ElseIf html.TagType = HtmlNode.HtmlTagType.EndTag Then
            Return
        End If

        Dim oHtmlNode As TreeNode

        If oNode Is Nothing Then
            oHtmlNode = tvwHtml.Nodes.Add(html.Name)
        Else
            oHtmlNode = oNode.Nodes.Add(html.Name)
        End If
        oHtmlNode.Tag = html

        If html.TagType = HtmlNode.HtmlTagType.CommentTag Then
            oHtmlNode.Text = "<!-- -->"
        ElseIf html.TagType = HtmlNode.HtmlTagType.NoneTag Then
            If html.Html = vbCrLf Then
                oHtmlNode.Text = "  "
            Else
                oHtmlNode.Text = html.Html
            End If
        End If

        Dim I As Integer
        For I = 1 To html.ChildNode.Count
            AddTree(html.ChildNode(I), oHtmlNode)
        Next
    End Sub


各クラスの簡単な説明:
HtmlParser クラス
  Nodes As Collection          ' タグ(HtmlNode)コレクション

HtmlNodeAttribute クラス
  Name As String               ' 属性名
  Value As String              ' 属性値

HtmlTagType 列挙型
  StartTag                     ' 開始タグ
  EndTag                       ' 終了タグ
  OmitTag                      ' 空要素タグ(開始が終了タグもかねる。img など終了タグが必要ないもの。/>で終わるもの。)
  CommentTag                   ' コメントタグ
  NoneTag                      ' タグのない要素(文字列)

HtmlNode クラス
  Name As String               ' タグの名前
  TagType As HtmlTagType       ' タグの種類
  Attributes As Collection     ' 属性情報 (HtmlNodeAttribute)のコレクション
  ChildNode As NodeCollection  ' 子タグ (HtmlNode)のコレクション
  Html As String               ' タグの HTML文
  Parent As HtmlNode           ' 親タグ

これらのクラスがツリー構造で格納されます。
HtmlParser
HtmlNodes
HtmlNode
HtmlAttributes
HtmlAttribute

ソースコード:
Public Class HtmlParser
    Public Nodes As New Collection

    Public Sub New()
    End Sub

    Public Sub New(ByVal FileName As String)
        Load(FileName)
    End Sub

    Public Function Load(ByVal path As String) As Boolean
        ' データクリア
        While Nodes.Count > 0
            Nodes.Remove(1)
        End While

        Dim sr As New System.IO.StreamReader(path, System.Text.Encoding.GetEncoding("shift-jis"))

        Dim sLine As String
        Dim nPos As Integer
        Dim bTag As Boolean
        Dim sTag As String
        Dim bComment As Boolean = False
        Dim bStringD As Boolean
        Dim bStringS As Boolean

        Dim sInnerHTML As String = ""
        Dim colToken As New Collection

        Dim sLastTag As String

        Do While sr.Peek() >= 0
            sLine = sr.ReadLine()

            nPos = 0
            While nPos < sLine.Length
                Dim c As String
                c = sLine.Substring(nPos, 1)

                If bComment Then
                    sInnerHTML += c

                    If bStringD = False And bStringS = False Then
                        If c = """" Then
                            bStringD = True
                        ElseIf c = "'" Then
                            If sLastTag.ToLower() <> "vb" Then
                                bStringS = True
                            Else
                                ' 行末まで読み込む
                                nPos += 1
                                sInnerHTML += sLine.Substring(nPos)
                                nPos = sLine.Length
                            End If
                        End If
                    ElseIf bStringD = True Then
                        If c = """" Then
                            bStringD = False
                        End If
                    ElseIf bStringS = True Then
                        If c = """" Then
                            bStringS = False
                        End If
                    End If

                    If (Not bStringD) AndAlso (Not bStringS) AndAlso c = ">" Then
                        If sInnerHTML.Length > 3 AndAlso sInnerHTML.Substring(sInnerHTML.Length - 3, 3) = "-->" Then
                            colToken.Add(sInnerHTML)
                            sInnerHTML = ""
                            bComment = False
                        End If
                    End If
                Else

                    Select Case c
                        Case "<"

                            If bTag Then
                                sInnerHTML += c
                            Else
                                If sInnerHTML.Length > 0 Then
                                    colToken.Add(sInnerHTML)
                                End If
                                sInnerHTML = ""

                                If sLine.Substring(nPos, 4) = "<!--" Then
                                    bComment = True
                                    bStringD = False
                                    bStringS = False
                                    sInnerHTML = c
                                Else
                                    bTag = True
                                    sTag = c
                                End If

                            End If

                        Case ">"
                            If bTag Then
                                sTag += c
                                If sTag.Length > 0 Then
                                    colToken.Add(sTag)

                                    Dim nTagType As HtmlNode.HtmlTagType = HtmlNode.GetTagType(sTag)
                                    sLastTag = HtmlNode.GetTagName(sTag, nTagType)
                                End If
                                sTag = ""

                                bTag = False
                            End If
                        Case Else
                            If bTag Then
                                sTag += c
                            Else
                                sInnerHTML += c
                            End If
                    End Select
                End If

                nPos += 1
            End While

            If bComment Then
                sInnerHTML += vbCrLf
            Else
                If Not bTag Then
                    If sInnerHTML.Length > 0 Then
                        colToken.Add(sInnerHTML)
                        sInnerHTML = ""
                    End If

                    colToken.Add(vbCrLf)
                Else
                    sTag += vbCrLf
                End If
            End If
        Loop
        sr.Close()

        Dim I As Integer
        For I = 1 To colToken.Count

            Dim html As New HtmlNode
            html.SetHtml(colToken, I)
            Nodes.Add(html)
        Next

        Return True
    End Function

    Public Function MakeInnerHtml() As String
        Dim sInnerHtml As String = ""
        Dim I As Integer
        For I = 1 To Me.Nodes.Count
            sInnerHtml += MakeInnerHtml(Me.Nodes.Item(I))
        Next
        Return sInnerHtml
    End Function

    Private Function MakeInnerHtml(ByVal html As HtmlNode) As String
        Dim sInnerHtml As String = html.Html
        Dim I As Integer
        For I = 1 To html.ChildNode.Count
            sInnerHtml += MakeInnerHtml(html.ChildNode.Item(I))
        Next
        Return sInnerHtml
    End Function

    Public Function Save(ByVal path As String) As Boolean
        Dim sInnerHtml As String = MakeInnerHtml()
        Dim sw As New IO.StreamWriter(path, False, System.Text.Encoding.GetEncoding("shift-jis"))
        sw.Write(sInnerHtml)
        sw.Flush()
        sw.Close()
    End Function
End Class

Public Class HtmlNodeAttribute
    Public Name As String
    Public Value As String
End Class

Public Class HtmlNode
    Public Enum HtmlTagType As Integer
        StartTag
        EndTag
        OmitTag
        CommentTag
        NoneTag
    End Enum

    Private _name As String
    Public TagType As HtmlTagType
    Public Attributes As New Collection

    Public ChildNode As New Collection

    Public Html As String
    Private InnerHtml As String

    Public Parent As HtmlNode

    Public Property Name() As String
        Get
            Return _name
        End Get
        Set(ByVal Value As String)
            _name = Value
            ChangeTagName(Value)

            ' 終了タグの名前も変える
            If TagType = HtmlTagType.StartTag Then
                If ChildNode.Count > 0 Then
                    Dim child As HtmlNode = ChildNode.Item(ChildNode.Count)
                    child.Name = Value
                    child.ChangeTagName(Value)
                End If
            End If
        End Set
    End Property

    Private Sub ChangeTagName(ByVal sName As String)
        Html = "<"
        If Me.TagType = HtmlTagType.EndTag Then
            Html += "/"
        End If
        Html += sName
        Dim I As Integer
        Dim att As HtmlNodeAttribute
        For I = 1 To Attributes.Count
            att = CType(Attributes.Item(I), HtmlNodeAttribute)
            Html += " " & att.Name
            If att.Value.Length > 0 Then
                Html += "=" & att.Value
            End If
        Next
        If Me.TagType = HtmlTagType.OmitTag Then
            Html += "/>"
        Else
            Html += ">"
        End If
    End Sub

    Public Sub SetHtml(ByRef colText As Collection, ByRef nIndex As Integer)
        If nIndex > colText.Count Then Return

        Dim sItem As String = colText(nIndex)
        nIndex += 1

        TagType = GetTagType(sItem)
        _name = GetTagName(sItem, TagType, Attributes).ToLower
        Html = sItem

        Dim sOmitTagArray() As String = {"img", "br", "hr", "meta"}
        Dim I As Integer
        For I = 0 To UBound(sOmitTagArray)
            If sOmitTagArray(I) = _name Then
                TagType = HtmlTagType.OmitTag
            End If
        Next

        InnerHtml = sItem

        Select Case TagType
            Case HtmlTagType.OmitTag, _
                 HtmlTagType.NoneTag, _
                 HtmlTagType.CommentTag
                ' 単一タグ

            Case HtmlTagType.StartTag
                ' 開始タグ

                ' 終了タグまでループ
                Do
                    If nIndex > colText.Count Then Exit Do

                    Dim child As New HtmlNode
                    child.Parent = Me
                    child.SetHtml(colText, nIndex)
                    InnerHtml += child.InnerHtml

                    ChildNode.Add(child)
                    If child.Name = Me.Name Then
                        'child = Nothing
                        Exit Do
                    Else
                    End If
                Loop
            Case HtmlTagType.EndTag
        End Select
    End Sub

    Public Shared Function GetTagType(ByVal sText As String) As HtmlNode.HtmlTagType
        If sText.Length < 2 Then Return HtmlTagType.NoneTag

        ' タグタイプの識別
        Dim nTagType = HtmlTagType.NoneTag
        If sText.Substring(0, 2) = "</" Then
            nTagType = HtmlTagType.EndTag
        Else
            If sText.Length > 3 AndAlso sText.Substring(0, 1) = "<" AndAlso sText.Substring(sText.Length - 2, 2) = "/>" Then
                nTagType = HtmlTagType.OmitTag
            ElseIf sText.Length > 7 AndAlso sText.Substring(0, 4) = "<!--" AndAlso sText.Substring(sText.Length - 3, 3) = "-->" Then
                nTagType = HtmlTagType.CommentTag
            Else
                If sText.Substring(0, 1) = "<" Then
                    nTagType = HtmlTagType.StartTag
                End If
            End If
        End If
        Return nTagType
    End Function

    Public Shared Function GetTagName(ByVal sText As String, ByVal nTagType As HtmlNode.HtmlTagType, Optional ByRef colAttribute As Collection = Nothing) As String
        If sText.Length < 2 Then Return ""

        Dim sName As String = ""
        Dim nPos As Integer = 1
        Select Case nTagType
            Case HtmlTagType.StartTag, HtmlTagType.OmitTag
                nPos = 1
            Case HtmlTagType.EndTag
                nPos = 2
            Case Else
                Return ""
        End Select

        While sText.Substring(nPos, 1) = ""
            nPos += 1
        End While

        Dim c As String
        Do
            c = sText.Substring(nPos, 1)
            If c = " " OrElse c = ">" Then
                Exit Do
            End If
            sName += c
            nPos += 1
        Loop

        If Not colAttribute Is Nothing AndAlso nPos < sText.Length Then
            ' 属性情報分解
            Dim sBlock As String = ""
            While colAttribute.Count > 0
                colAttribute.Remove(1)
            End While
            Dim col As New Collection
            Dim bStringD As Boolean = False
            Dim bStringS As Boolean = False
            Do
                c = sText.Substring(nPos, 1)
                If c = "'" Then
                    bStringS = Not bStringS
                    sBlock += c
                ElseIf c = """" Then
                    bStringD = Not bStringD
                    sBlock += c
                ElseIf Not bStringS AndAlso Not bStringD AndAlso (c = " " Or c = "/" Or c = ">") Then
                    If sBlock.Length > 0 Then
                        col.Add(sBlock)
                    End If
                    sBlock = ""
                Else
                    sBlock += c
                End If
                nPos += 1
                If nPos >= sText.Length Then
                    Exit Do
                End If
            Loop

            Dim I As Integer
            For I = 1 To col.Count
                c = col(I)
                Dim att As New HtmlNodeAttribute
                nPos = c.IndexOf("=")
                If nPos < 0 Then
                    att.Name = c
                Else
                    att.Name = c.Substring(0, nPos).Trim()
                    att.Value = c.Substring(nPos + 1).Trim()
                End If
                colAttribute.Add(att)

                'Debug.WriteLine(sName & " " & att.Name & " = " & att.Value)
            Next
        End If

        Return sName
    End Function
End Class


Copyright © 2004 Uz. All rights reserved.
http://uzgalaxy.com/
このページはInfoArchiveで作成されています。