; Demo / manual test for UnHTM() and UnSlashUnicode().
; HTM receives the literal text of the continuation section below, one sample per line:
; two symbol columns followed by a Russian description of the symbol.
; NOTE(review): both symbol columns currently hold the same literal character, so the
; sample exercises no entity decoding. Presumably the first column was meant to be the
; HTML entity form of the symbol - confirm against the original data.
HTM =
(
§ § параграф
© © знак охраны авторского права (copyright)
® ® символ зарегистрированного товарного знака
™ ™ символ товарного знака
° ° знак градуса
« « левая кавычка (левая ёлочка)
» » правая кавычка (правая ёлочка)
… … многоточие
’ ’ апостроф
„ „ открывающая лапка
“ “ закрывающая лапка
“ “ открывающая английская лапка
” ” закрывающая английская лапка
• • жирная точка
– – короткое тире (см. одноименный § 158)
− − минус
± ± плюс-минус
— — тире
№ № знак номера
)
;!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
;!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
; Decode \uXXXX escapes first, then strip tags and decode HTML entities.
t := UnHTM(UnSlashUnicode(HTM))
; split long line to smaller lines about 40-50 symbols length
;t := RegExReplace(t,".{40,50}(\s)","$0`n")
; Show the converted text.
MsgBox, % t
; Alternative conversions kept for manual comparison (disabled):
;MsgBox, % ComUnHTML( HTM )
;MsgBox, % UnHTM( HTM )
; Converts HTML to plain text by writing it into an in-memory "HtmlFile" COM document
; and reading back the innerText of the document's root element.
;   html    - HTML markup to convert.
;   returns - the value of documentElement.innerText for the written markup.
ComUnHTML(html)
{
    doc := ComObjCreate("HtmlFile")
    doc.write(html)
    plainText := doc.documentElement.innerText
    return plainText
}
; UnHTM - Remove HTML formatting / convert to ordinary text.  By SKAN, 19-Nov-2009.
; Forum topic: www.autohotkey.com/forum/topic51342.html
;
;   HTM     - HTML text.
;   returns - the text with every <...> tag replaced by one space, known named entities
;             (see the HT table) and numeric references of the form &#NNN; replaced by the
;             corresponding character, and leading/trailing whitespace removed.
;
; HT is a flat lookup table: each entry is the entity name without its trailing ";"
; immediately followed by the single replacement character ("&amp" then "&", "&lt" then "<").
; The entity names must stay spelled out as text: if they are stored as the decoded
; characters, the lookups below can no longer find them.
UnHTM( HTM ) {
    Static HT
    ; Build the table once. "&nbsp" maps to an ordinary space, "&quot" to a double quote
    ; (written "" inside the literal), "&shy" to a soft hyphen (Chr(173), written
    ; explicitly because the character is invisible in most editors).
    IfEqual, HT,, SetEnv, HT, % "&aacuteá&acircâ&acute´&aeligæ&agraveà&amp&&aringå&atildeã&aumlä"
        . "&bdquo„&brvbar¦&bull•&ccedilç&cedil¸&cent¢&circˆ&copy©&curren¤&dagger†&dagger‡&deg°"
        . "&divide÷&eacuteé&ecircê&egraveè&ethð&eumlë&euro€&fnofƒ&frac12½&frac14¼&frac34¾&gt>"
        . "&hellip…&iacuteí&icircî&iexcl¡&igraveì&iquest¿&iumlï&laquo«&ldquo“&lsaquo‹&lsquo‘&lt<"
        . "&macr¯&mdash—&microµ&middot·&nbsp &ndash–&not¬&ntildeñ&oacuteó&ocircô&oeligœ&ograveò"
        . "&ordfª&ordmº&oslashø&otildeõ&oumlö&para¶&permil‰&plusmn±&pound£&quot""&raquo»&rdquo”&reg®"
        . "&rsaquo›&rsquo’&sbquo‚&scaronš&sect§&shy" . Chr(173) . "&sup1¹&sup2²&sup3³&szligß&thornþ&tilde˜"
        . "&times×&trade™&uacuteú&ucircû&ugraveù&uml¨&uumlü&yacuteý&yen¥&yumlÿ"

    ; Disabled optional pre-pass for double-escaped input (presumably "&amp;lt;" -> "&lt;"
    ; and "&amp;#60;" -> "&#60;"):
    ;HTM := RegExReplace( HTM, "&amp;(\w+;)", "&$1" )
    ;HTM := RegExReplace( HTM, "&amp;(#\d+;)", "&$1" )

    TXT := RegExReplace( HTM, "<[^>]+>", " " )      ; Replace every tag "<...>" with a space

    ; Collect the distinct "&name;" candidates found in the text. Splitting on "&" and ";"
    ; puts the candidates into the even-numbered fields.
    Loop, Parse, TXT, &`;
        L := "&" A_LoopField ";", R .= (!(A_Index&1)) ? ( (!InStr(R,L,1)) ? L:"" ) : ""
    StringTrimRight, R, R, 1                        ; Drop the final ";" of the list

    Loop, Parse, R, `;                              ; Each field is "&name" or "&#NNN"
    {
        If ( F := InStr( HT, A_LoopField ) )        ; Named entity found in the table:
            StringReplace, TXT, TXT, %A_LoopField%`;, % SubStr( HT, F+StrLen(A_LoopField), 1 ), All
        Else If ( SubStr( A_LoopField, 2, 1 ) = "#" ) ; Numeric reference: the text after "&#" is the code
            StringReplace, TXT, TXT, %A_LoopField%`;, % Chr(SubStr(A_LoopField,3)), All
    }

    ; Disabled: collapse runs of spaces into a single space.
    ;TXT := RegExReplace(TXT, " +", " ")
    Return RegExReplace( TXT, "(^\s*|\s*$)" )       ; Remove leading/trailing white space
}
;---------------------------------------
; Convert strings
;---------------------------------------
/*
CP_ACP = 0
CP_OEMCP = 1
CP_MACCP = 2
CP_UTF7 = 65000
CP_UTF8 = 65001
*/
; Re-encodes text from code page 0 (CP_ACP) to code page 1 (CP_OEMCP),
; going through an intermediate wide-character buffer.
;   sString - text in the ANSI code page.
;   returns - the same text in the OEM code page.
Ansi2Oem(sString) {
    Ansi2Unicode(sString, wideBuf, 0)   ; ANSI -> wide
    Unicode2Ansi(wideBuf, oemText, 1)   ; wide -> OEM
    Return oemText
}
; Re-encodes text from code page 1 (CP_OEMCP) to code page 0 (CP_ACP),
; going through an intermediate wide-character buffer.
;   zString - text in the OEM code page.
;   returns - the same text in the ANSI code page.
Oem2Ansi(zString) {
    Ansi2Unicode(zString, wideBuf, 1)   ; OEM -> wide
    Unicode2Ansi(wideBuf, ansiText, 0)  ; wide -> ANSI
    Return ansiText
}
; Re-encodes text from code page 0 (CP_ACP) to code page 65001 (CP_UTF8),
; going through an intermediate wide-character buffer.
;   sString - text in the ANSI code page.
;   returns - the same text encoded as UTF-8.
Ansi2UTF8(sString) {
    Ansi2Unicode(sString, wideBuf, 0)       ; ANSI -> wide
    Unicode2Ansi(wideBuf, utf8Text, 65001)  ; wide -> UTF-8
    Return utf8Text
}
; Re-encodes text from code page 65001 (CP_UTF8) to code page 0 (CP_ACP),
; going through an intermediate wide-character buffer.
;   zString - text encoded as UTF-8.
;   returns - the same text in the ANSI code page.
UTF82Ansi(zString) {
    Ansi2Unicode(zString, wideBuf, 65001)   ; UTF-8 -> wide
    Unicode2Ansi(wideBuf, ansiText, 0)      ; wide -> ANSI
    Return ansiText
}
; Converts the null-terminated multi-byte text stored in sString to wide characters
; using the Win32 function MultiByteToWideChar.
;   sString - [in]  variable whose buffer holds the source bytes in code page CP.
;   wString - [out] variable that receives the wide-character data, including the
;             terminating null. Read it through its address (&wString).
;   CP      - source code page (0 = CP_ACP, 1 = CP_OEMCP, 65001 = CP_UTF8, ...).
; No value is returned. If the conversion fails, wString is set to an empty wide string.
; NOTE(review): the buffer of sString is read as raw bytes; on a Unicode build of
; AutoHotkey native strings are presumably UTF-16 already - confirm the target build.
Ansi2Unicode(ByRef sString, ByRef wString, CP = 0)
{
    ; Pointer-sized arguments must not be passed as "UInt": that truncates addresses on
    ; 64-bit builds. A_PtrSize is blank on interpreters without "Ptr" support.
    PtrType := A_PtrSize ? "Ptr" : "UInt"

    ; First call: ask for the required size in wide characters (terminator included).
    nSize := DllCall("MultiByteToWideChar"
        , "UInt", CP
        , "UInt", 0
        , PtrType, &sString
        , "Int", -1
        , PtrType, 0
        , "Int", 0)
    if (nSize < 1)
    {
        ; Conversion impossible: leave a valid, empty wide string behind so that
        ; readers of &wString always find a 16-bit terminator.
        VarSetCapacity(wString, 2, 0)
        return
    }

    ; Second call: perform the conversion into the freshly sized buffer.
    VarSetCapacity(wString, nSize * 2)
    DllCall("MultiByteToWideChar"
        , "UInt", CP
        , "UInt", 0
        , PtrType, &sString
        , "Int", -1
        , PtrType, &wString
        , "Int", nSize)
}
; Converts the null-terminated wide-character text stored in wString to multi-byte text
; using the Win32 function WideCharToMultiByte.
;   wString - [in]  variable whose buffer holds the wide-character data.
;   sString - [out] variable that receives the converted bytes in code page CP.
;   CP      - target code page (0 = CP_ACP, 1 = CP_OEMCP, 65001 = CP_UTF8, ...).
; No value is returned. If the conversion fails, sString is set to an empty string.
; NOTE(review): the result is written through a "Str" argument; on a Unicode build of
; AutoHotkey native strings are presumably UTF-16, so the bytes would not be readable
; as text there - confirm the target build.
Unicode2Ansi(ByRef wString, ByRef sString, CP = 0)
{
    ; Pointer-sized arguments must not be passed as "UInt": that truncates addresses on
    ; 64-bit builds. A_PtrSize is blank on interpreters without "Ptr" support.
    PtrType := A_PtrSize ? "Ptr" : "UInt"

    ; First call: ask for the required size in bytes (terminator included).
    nSize := DllCall("WideCharToMultiByte"
        , "UInt", CP
        , "UInt", 0
        , PtrType, &wString
        , "Int", -1
        , PtrType, 0
        , "Int", 0
        , PtrType, 0
        , PtrType, 0)
    if (nSize < 1)
    {
        sString := ""
        return
    }

    ; Second call: perform the conversion into the freshly sized buffer.
    VarSetCapacity(sString, nSize)
    DllCall("WideCharToMultiByte"
        , "UInt", CP
        , "UInt", 0
        , PtrType, &wString
        , "Int", -1
        , "Str", sString
        , "Int", nSize
        , PtrType, 0
        , PtrType, 0)
}
;-------------------------------------------------
; URI percent-encode/decode and \uXXXX unescaping
;------------------------------------------------
; UriEncode v 0.3 / (w) 24.06.2008 by derRaphael / zLib-Style release
; Percent-encodes a string character by character.
;   str     - text to encode.
;   returns - str where every character whose code is above 0x7F, below 0x30, or equal
;             to 0x3D ("=") is replaced by "%" followed by its hexadecimal code
;             (left-padded to two digits); all other characters are copied unchanged.
; The caller's integer format is saved on entry and restored before returning.
UriEncode(str)
{
    savedFormat := A_FormatInteger
    encoded := ""
    SetFormat, Integer, H               ; Asc() results now stringify as 0xNN
    Loop, Parse, str
    {
        code := Asc(A_LoopField)
        if (code >= 0x30 && code <= 0x7f && code != 0x3d)
        {
            encoded .= A_LoopField      ; character is passed through as-is
            continue
        }
        hex := SubStr(Asc(A_LoopField), 3)  ; drop the "0x" prefix
        if (StrLen(hex) < 2)
            hex := "0" . hex
        encoded .= "%" . hex
    }
    SetFormat, Integer, %savedFormat%
    return encoded
}
; UriDecode v 0.1 / (w) 28.06.2008 by derRaphael / zLib-Style release
; Decodes percent-escapes in a string.
;   str     - text that may contain "%XX" sequences (XX = two hexadecimal digits).
;   returns - str with every "%XX" replaced by the character with code 0xXX.
;             A "%" that is not followed by two hexadecimal digits is kept literally
;             instead of being dropped or turned into a bogus character.
UriDecode(str)
{
    txt := ""
    ; Splitting on "%" yields the text before the first "%" as field 1; every later
    ; field starts with the two characters that followed a "%".
    Loop, Parse, str, `%
    {
        if (A_Index = 1)
        {
            txt := A_LoopField
            continue
        }
        hex := SubStr(A_LoopField, 1, 2)
        if (RegExMatch(hex, "^[0-9A-Fa-f]{2}$"))
            txt .= Chr("0x" . hex) . SubStr(A_LoopField, 3)
        else
            txt .= "%" . A_LoopField    ; malformed escape: restore the "%" unchanged
    }
    return txt
}
; Replaces escape sequences of the form \uXXXX (four hexadecimal digits, e.g. \u0026)
; with the character that has that code.
; by Mikhail Kuropyatnikov 2009 (micdelt@mail.ru)
;   s       - text that may contain \uXXXX sequences.
;   returns - s with every sequence decoded; all other text is copied unchanged.
; The input is consumed in a single left-to-right pass and the output is assembled
; separately, so each sequence is decoded exactly where it was matched and text
; produced by a replacement is never scanned again.
UnSlashUnicode(s)
{
    out := ""
    start := 1      ; position in s of the first character not yet copied to out
    Loop
    {
        pos := RegExMatch(s, "\\u([0-9a-fA-F]{4})", m, start)
        if (pos = 0)
            break
        ; Copy the untouched text before the match, then the decoded character.
        out .= SubStr(s, start, pos - start) . Chr("0x" . m1)
        start := pos + 6    ; a \uXXXX sequence is always six characters long
    }
    return out . SubStr(s, start)
}