最近开始尝试使用 .NET Core
的 HttpClient
来学习写爬虫程序,第一步就遇到了某个使用 GB2312
编码的网页, 爬到的结果里面中文都是乱码, 于是想当然地在 Headers
里面添加了 "Content-Type: text/html; charset=utf-8"
,却被无情地抛出一个不支持这个头部的异常(Content-Type 属于内容头,不能添加到请求头集合中)。走了很多弯路之后才发觉,这个问题原来出在 HttpClient 的默认字符集支持上。
创建 HttpClient 服务接口
在 .Application 项目中添加名为 ISampleClient 的接口,其代码如下:
using System;
using System.Collections.Generic;
using System.Text;
using System.Threading.Tasks;

namespace Marcin.Application
{
    /// <summary>
    /// Contract for a client that downloads a resource and returns its body as text.
    /// </summary>
    public interface ISampleClient
    {
        /// <summary>
        /// Downloads the resource at <paramref name="uri"/> and returns its content as a string.
        /// </summary>
        /// <param name="uri">Address of the resource to download.</param>
        /// <param name="charset">
        /// Name of the character encoding used to decode the response body
        /// (for example "UTF-8", "GB2312" or "GBK"). Defaults to "UTF-8".
        /// </param>
        /// <returns>A task whose result is the decoded response body.</returns>
        Task<string> GetDataAsync(string uri, string charset = "UTF-8");
    }
}
实现 HttpClient 服务接口
HttpClient 默认不支持 GB2312 和 GBK 编码,所以如果用它来下载以 GB2312 或 GBK 编码的网页,其中的中文将显示为乱码。
解决办法:添加 NuGet 包 System.Text.Encoding.CodePages,并在代码中调用 Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); 来注册 EncodingProvider。
添加接口的实现类 SampleClient
,其代码如下:
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
using System.IO;
using System.Diagnostics;

namespace Marcin.Application
{
    /// <summary>
    /// <see cref="ISampleClient"/> implementation that downloads a page with
    /// <see cref="HttpClient"/> and decodes the body with a caller-chosen encoding,
    /// including code-page encodings such as GB2312 and GBK.
    /// </summary>
    public class SampleClient : ISampleClient
    {
        private readonly HttpClient _httpClient;
        private readonly ILogger _logger;

        /// <summary>
        /// Creates the client, registers the code-page encoding provider and sets
        /// browser-like default "Accept" and "User-Agent" request headers.
        /// </summary>
        /// <param name="logger">Logger used for timing and failure messages.</param>
        /// <param name="httpClient">HTTP client used to send the requests.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="logger"/> or <paramref name="httpClient"/> is null.
        /// </exception>
        public SampleClient(ILogger<SampleClient> logger, HttpClient httpClient)
        {
            _logger = logger ?? throw new ArgumentNullException(nameof(logger));
            _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));

            // Makes code-page encodings (GB2312, GBK, ...) resolvable through Encoding.GetEncoding.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            _httpClient.DefaultRequestHeaders.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            _httpClient.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0");
        }

        /// <summary>
        /// Sends a GET request to <paramref name="uri"/> and returns the response body
        /// decoded with the encoding named by <paramref name="charset"/>.
        /// </summary>
        /// <param name="uri">Address of the resource to download.</param>
        /// <param name="charset">Name of the encoding used to decode the body. Defaults to "UTF-8".</param>
        /// <returns>The decoded response body.</returns>
        /// <exception cref="HttpRequestException">The request failed; it is logged and rethrown.</exception>
        /// <exception cref="ArgumentException"><paramref name="charset"/> is not a known encoding name.</exception>
        public async Task<string> GetDataAsync(string uri, string charset = "UTF-8")
        {
            _logger.LogInformation("SampleClient {0} at {1}", "Started", DateTime.UtcNow);
            Stopwatch sw = Stopwatch.StartNew();

            try
            {
                // The response, its stream and the reader are all disposed on every exit path.
                using (var response = await _httpClient.GetAsync(uri).ConfigureAwait(false))
                using (var body = await response.Content.ReadAsStreamAsync().ConfigureAwait(false))
                using (var reader = new StreamReader(body, Encoding.GetEncoding(charset)))
                {
                    // Read asynchronously so the calling thread is not blocked inside an async method.
                    return await reader.ReadToEndAsync().ConfigureAwait(false);
                }
            }
            catch (HttpRequestException hre)
            {
                // Record the failure before propagating it; `throw;` keeps the original stack trace.
                _logger.LogError(hre, "SampleClient request to {0} failed", uri);
                throw;
            }
            finally
            {
                sw.Stop();
                _logger.LogInformation("response.Content.ReadAsStreamAsync time cost: {0} ", sw.ElapsedMilliseconds.ToString());
            }
        }
    }
}
===END===