加入收藏 | 设为首页 | 会员中心 | 我要投稿 温州站长网 (https://www.0577zz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 站长学院 > Asp教程 > 正文

c#中过滤html的正则表达式

发布时间:2021-07-17 08:45:21 所属栏目:Asp教程 来源:互联网
导读:/// <summary> /// 去除HTML标记 /// </summary> /// <param name="Htmlstring">包括HTML的源码</param> /// <returns>已经去除后的文字</returns> public static string NoHTML(string Htmlstring) { //删除脚本 Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", …

/// <summary>
/// Removes HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup.</param>
/// <returns>
/// The input with script blocks, tags and comment markers removed, a fixed set of
/// character entities decoded, any other numeric entity and any remaining angle
/// bracket dropped; the result is then HTML-encoded and trimmed.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    // Remove script blocks before generic tags so the script source does not survive
    // as text. Singleline lets ".*?" cross line breaks (scripts are usually multi-line).
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove comment leftovers: a stray terminator, then an unterminated opener up to end of line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the supported character entities.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

    // Drop every numeric entity that was not decoded above.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Strings are immutable: the result of Replace must be assigned back,
    // otherwise these calls have no effect.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // WebUtility.HtmlEncode needs no active HTTP request, unlike
    // HttpContext.Current.Server, which is null outside a request.
    Htmlstring = System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();

    return Htmlstring;
}

C#过滤Html标签及空格

/// <summary>
/// Removes every <c>&lt;...&gt;</c> run, and every character matched by the second
/// alternative of the pattern, from a string.
/// </summary>
/// <param name="HTMLStr">Text to filter; may be null or empty.</param>
/// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
public static string FilterHTML(string HTMLStr)
{
    // Null or empty input has nothing to filter.
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return "";
    }

    // NOTE(review): the second alternative is a single blank character as it appears
    // in this listing; confirm whether the pattern was meant to match "&nbsp;" instead.
    return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>| ", "");
}

写一个静态方法移除HTML标签

#region
/// <summary>
/// Removes HTML tags from a string.
/// </summary>
/// <param name="HTMLStr">Text from which every <c>&lt;...&gt;</c> run is removed.</param>
/// <returns>The input text with all <c>&lt;...&gt;</c> runs deleted.</returns>
public static string ParseTags(string HTMLStr)
{
    // "<", then any run of characters other than ">", then ">".
    const string tagPattern = "<[^>]*>";

    string withoutTags = System.Text.RegularExpressions.Regex.Replace(HTMLStr, tagPattern, "");
    return withoutTags;
}
#endregion

取出文本中的图片地址

#region
/// <summary>
/// Returns the <c>src</c> value of the first <c>&lt;img&gt;</c> tag found in the text.
/// </summary>
/// <param name="HTMLStr">HTML text to search.</param>
/// <returns>
/// The image address with its original letter case and without surrounding quotes,
/// or an empty string when no <c>&lt;img&gt;</c> tag with a <c>src</c> attribute is found.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
    // The src attribute must be preceded by whitespace, so attributes such as
    // "data-src" are not mistaken for it. The value may be double-quoted,
    // single-quoted or unquoted; the quotes are never part of the capture.
    const string imgPattern =
        @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))[^>]*>";

    // Match case-insensitively instead of lower-casing the input:
    // URL paths are case-sensitive and must be returned unchanged.
    Match match = Regex.Match(HTMLStr, imgPattern, RegexOptions.IgnoreCase);

    return match.Success ? match.Groups["url"].Value : string.Empty;
}
#endregion

提取HTML代码中文字的C#函数

using System;
using System.Text.RegularExpressions;

/// <summary>
/// Sample console program that extracts the plain text from HTML source.
/// </summary>
public class StripHTMLTest
{
    // Patterns are applied in order; Replacements[i] is substituted for Patterns[i].
    private static readonly string[] Patterns =
    {
        // Script blocks; (?s) lets ".*?" cross line breaks.
        @"(?s)<script[^>]*?>.*?</script>",
        // Opening/closing tags with optional attributes; \7 refers back to the opening quote.
        // NOTE(review): the nested quantifiers can backtrack heavily on long
        // unterminated "<word..." input; consider a match timeout for untrusted text.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        // A line break together with the whitespace that follows it.
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        // Any other numeric entity.
        @"&#(\d+);",
        // Comment terminator, then comment opener up to the end of its line.
        @"-->",
        @"<!--.*\n"
    };

    private static readonly string[] Replacements =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\u00A1", // chr(161)
        "\u00A2", // chr(162)
        "\u00A3", // chr(163)
        "\u00A9", // chr(169)
        "",
        "\r\n",   // gives the following comment-opener rule a line end to stop at
        ""
    };

    /// <summary>
    /// Strips a sample HTML document and prints the remaining text.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML(
            "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Extracts the text from HTML source.
    /// </summary>
    /// <param name="strHtml">Source text that contains HTML markup.</param>
    /// <returns>
    /// The text with script blocks, tags and comments removed, a fixed set of character
    /// entities decoded, and any remaining angle bracket or CR/LF pair dropped.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        string strOutput = strHtml;

        for (int i = 0; i < Patterns.Length; i++)
        {
            strOutput = Regex.Replace(strOutput, Patterns[i], Replacements[i], RegexOptions.IgnoreCase);
        }

        // Strings are immutable: the result of Replace must be assigned back,
        // otherwise these calls have no effect.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");

        return strOutput;
    }
}

TempContent 表示包含有html的字符串;
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent,"<[^>]+>",""); // “+”:尖括号内至少有一个字符(不会匹配空的 “<>”)
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent,"<[^>]*>",""); // “*”:尖括号内可以有任意个字符(也会匹配空的 “<>”)

(编辑:温州站长网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

    热点阅读