HtmlAgilityPack is a very popular tool for parsing HTML. Most websites don't have an API, so we need to parse their HTML and extract the exact values we need.
Installation with NuGet
Install-Package HtmlAgilityPack
Let’s parse the NTV (a TV channel) schedule page for program names and times.
Example
Creating classes for the values
/// <summary>
/// A TV channel and the list of its programs.
/// </summary>
class Channel
{
    /// <summary>Name of the channel.</summary>
    public string Name { get; set; }

    /// <summary>Programs that belong to this channel.</summary>
    public List<ChannelProgram> Programs { get; set; }

    /// <summary>
    /// Creates a channel whose <see cref="Programs"/> list is empty (never null),
    /// so callers can add to it right away.
    /// </summary>
    public Channel()
    {
        Programs = new List<ChannelProgram>();
    }
}
/// <summary>
/// A single program entry of a channel.
/// </summary>
class ChannelProgram
{
    /// <summary>Title of the program.</summary>
    public string Name { get; set; }

    /// <summary>Time of the program, stored as a string.</summary>
    public string Time { get; set; }
}
Parsing HTML
// Download the NTV schedule page as UTF-8 text.
// WebClient implements IDisposable, so it is scoped to a using block and
// released as soon as the download has finished.
Uri url = new Uri("http://www.ntv.com.tr/tv/yayin-akisi/");
string html;
using (WebClient webclient = new WebClient())
{
    webclient.Encoding = Encoding.UTF8;
    html = webclient.DownloadString(url);
}

var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(html);

// The schedule is the one <ul class="programmes"> element; each <li> directly
// under it is one program. Single() throws InvalidOperationException when the
// page contains no such list or more than one.
var programList = htmlDocument.DocumentNode.Descendants("ul")
    .Single(d => d.GetAttributeValue("class", "").Equals("programmes"))
    .SelectNodes("li");

var channel = new Channel();
channel.Name = "NTV";

// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard the loop instead of failing with a NullReferenceException.
if (programList != null)
{
    foreach (var program in programList)
    {
        // Both values live in <span> elements inside the item's <a>; look the
        // anchor up once and skip list items that do not have one.
        var anchor = program.SelectSingleNode("a");
        if (anchor == null)
        {
            continue;
        }

        var channelProgram = new ChannelProgram();
        channelProgram.Time = anchor.Descendants("span")
            .Single(d => d.GetAttributeValue("class", "").Equals("tv-hour"))
            .InnerText;
        // Carriage returns are stripped from the title text.
        channelProgram.Name = anchor.Descendants("span")
            .Single(d => d.GetAttributeValue("class", "").Equals("programmeTitle"))
            .InnerText.Replace("\r", "");
        channel.Programs.Add(channelProgram);
    }
}

// Print "<name> <time>" for every program, then wait for a key press.
foreach (var item in channel.Programs)
{
    Console.WriteLine(item.Name + " " + item.Time);
}
Console.ReadKey();