A family of Microsoft word processing software products for creating web, email, and print documents.
Yes, I got the same result when working on a large page that included JavaScript and CSS. The macro ran when I stripped those out. That's what I was referring to when I wrote "the macro ran out of memory on large complex pages".
The macro is about 20 years old, but web pages were a lot smaller then. The macro works by selecting the entire page and loading that into a string. The string is too big and overloads the buffer. It's not how I would write a macro today. Frankly, I think it's pretty amazing that 20 year old code can run at all.
The other choice you have is to simply open the HTML file using Word's HTML filter instead of its text filter, then save as a Word document. If you're just after the text in the HTML, save the Word file as text and all markup is removed.
I'm posting the basic code here, in case someone feels like re-writing this from scratch. The HTMLForm is in the template linked to in my previous message. It's nothing fancy, just an interface for choosing which macro to run.:
Public SubToRun$
Sub HTMLCleanup()
Load HTMLForm
HTMLForm.Show
Select Case SubToRun$
Case "DeleteHTML"
Call DeleteHTML
Case "FindAmpersand"
Call FindAmpersand
Case "FindExtended"
Call FindExtended
Case Else
End Select
End Sub
'Part 1 -- Remove HTML comments
Sub DeleteHTML()
Application.ScreenUpdating = False
WordBasic.EditSelectAll
SelText$ = WordBasic.GetText$(WordBasic.GetSelStartPos(), WordBasic.GetSelEndPos())
InitLen = Len(SelText$)
Loop1:
SelText$ = WordBasic.GetText$(WordBasic.GetSelStartPos(), WordBasic.GetSelEndPos())
NewLen = Len(SelText$)
'WordBasic.Print Int(100 - (NewLen * 100 / InitLen)); "%"
StartPos = InStr(1, SelText$, "<!-")
EndPos = InStr(1, SelText$, "->")
If StartPos = 0 Or EndPos = 0 Then GoTo EndSub1
WordBasic.EditGoTo "\StartOfSel"
If StartPos <> 1 Then WordBasic.CharRight StartPos - 1
WordBasic.CharRight EndPos - StartPos + 2, 1
WordBasic.EditClear
Count = Count + 1
WordBasic.ExtendSelection
WordBasic.EditGoTo "\EndOfDoc"
WordBasic.Cancel
GoTo Loop1
EndSub1:
Application.ScreenUpdating = True
Beep
MsgBox Str$(Count) + " '<!-->' codes removed" + Chr$(13) + "End Part 1"
'Part 2 -- Remove HTML proper
WordBasic.EditSelectAll
Application.ScreenUpdating = False
Loop2:
SelText$ = WordBasic.GetText$(WordBasic.GetSelStartPos(), WordBasic.GetSelEndPos())
NewLen = Len(SelText$)
'WordBasic.Print Int(100 - (NewLen * 100 / InitLen)); "%"
StartPos = InStr(1, SelText$, "<")
EndPos = InStr(1, SelText$, ">")
If StartPos = 0 Or EndPos = 0 Then GoTo EndSub2
WordBasic.EditGoTo "\StartOfSel"
If StartPos <> 1 Then WordBasic.CharRight StartPos - 1
WordBasic.CharRight EndPos - StartPos + 1, 1
WordBasic.EditClear
Count = Count + 1
WordBasic.ExtendSelection
WordBasic.EditGoTo "\EndOfDoc"
WordBasic.Cancel
GoTo Loop2
EndSub2:
Application.ScreenUpdating = True
WordBasic.Beep: Beep
MsgBox Str$(Count) + " '<>' codes removed" + Chr$(13) + "End Macro"
End Sub
Sub FindAmpersand()
WordBasic.EditSelectAll
Loop3:
SelText$ = WordBasic.GetText$(WordBasic.GetSelStartPos(), WordBasic.GetSelEndPos())
StartPos = InStr(1, SelText$, "&")
EndPos = InStr(1, SelText$, ";")
If StartPos = 0 Or EndPos = 0 Then GoTo EndSub
WordBasic.EditGoTo "\StartOfSel"
If StartPos <> 1 Then WordBasic.CharRight StartPos - 1
If StartPos > EndPos Then 'Check for ";" not part of HTML code
WordBasic.ExtendSelection
WordBasic.EditGoTo "\EndOfDoc"
WordBasic.Cancel
GoTo Loop3
End If
WordBasic.CharRight EndPos - StartPos + 1, 1
WordBasic.CharColor 9
Count = Count + 1
WordBasic.CharRight 1
WordBasic.ExtendSelection
WordBasic.EditGoTo "\EndOfDoc"
WordBasic.Cancel
GoTo Loop3
EndSub:
MsgBox Str$(Count) + " '&;' codes highlighted blue."
End Sub
Sub FindExtended()
With Selection.Find
.ClearFormatting
.Replacement.ClearFormatting
.Replacement.Font.Color = wdColorRed
.Text = "[^0127-^0255]"
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchWildcards = True
.Execute Replace:=wdReplaceAll
End With
MsgBox "Extended ASCII characters highlighted red."
End Sub