ScrapyFSharp


Simulate a browser

This module helps you navigate a website. Cookies, the Referer header, etc. are handled automatically.

Opening namespaces

1: 
2: 
3: 
open System
open System.Net
open ScrapyFSharp.Network

Creating a browser

1: 
let b1 = ScrapingBrowser()

Some fake user agents are available

1: 
b1.UserAgent <- FakeUserAgent.Chrome

Enable redirect header handling

1: 
b1.AllowAutoRedirect <- true

Enable redirects via HTML meta like 'meta http-equiv="refresh"'

1: 
b1.AllowMetaRedirect <- true

Sometimes the default .NET cookie parser throws an exception when you are scraping old Java websites, so you can use this workaround.

1: 
b1.UseDefaultCookiesParser <- false

If you are scraping GZIP compressed web pages, you can specify the decompression method.

1: 
b1.DecompressionMethod <- Some DecompressionMethods.GZip

Another syntax to create a browser

1: 
// Build a browser by transforming the BrowserConfig passed to the callback; only UserAgent is overridden here.
let b = browser (fun config -> { config with UserAgent = FakeUserAgent.InternetExplorer8 })

Simply download raw content as text

1: 
2: 
3: 
4: 
// Download the page body as a string, blocking the calling thread until the async workflow completes.
let text1 =
    let download =
        async {
            let! content = b.DownloadString(Uri "http://www.youscribe.com/")
            return content
        }
    Async.RunSynchronously download

text1 value is:

"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://ogp.me/ns/fb#">
<head><scrip ... [truncated]"

end ...

Simulate a form submission and parse the result

In this example, you can save the browser state after each page download. This can be useful if you plan to parallelize your scraping across multiple machines, need to pause a job, or are implementing a saga.

In the following example, we search for .NET books on Youscribe

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
open FSharp.Data
open ScrapyFSharp.CssSelectorExtensions

// Workaround: the HtmlText case of HtmlNode cannot be matched directly from here,
// so look for the first node (n itself or a descendant) whose Name() is null or
// empty -- presumably a text node -- and return its inner text.
// Falls back to the inner text of n when no such node exists.
let nodeText (n:HtmlNode) =
    let hasNoName (node:HtmlNode) = String.IsNullOrEmpty(node.Name())
    match n.DescendantsAndSelf() |> Seq.tryFind hasNoName with
    | Some unnamed -> unnamed.InnerText()
    | None -> n.InnerText()

// Run a search request against Youscribe and collect the text of every node
// matched by the result selector. Yields an empty list when the downloaded
// page exposes no HTML document.
let books =
    let searchUrl = Uri "http://www.youscribe.com/Search"
    let query = HttpRequestData.FormData [ "quick_search", ".net"; "theme_id", "99" ]
    let search =
        async {
            // NavigateTo returns the browser state reached after the request.
            let! state = b.NavigateTo(searchUrl, Get, query)
            let page = state.WebPage()
            match page.Html() with
            | None -> return List.empty
            | Some doc ->
                return
                    doc.CssSelect "div.explore-item.explore-doc .document-infos"
                    |> List.map nodeText
        }
    Async.RunSynchronously search

books value is:

[" .NET "; " .NET Gotchas "; " Programming .NET Security ";
 " Programming .NET Windows Applications "; " ADO.NET 3.5 Cookbook ";
 " VB.NET Language Pocket Reference "; " Learning Visual Basic .NET ";
 " Programming ASP.NET "; " Getting Started with .NET Gadgeteer ";
 " ASP.NET 2.0 Cookbook "; " ADO.NET in a Nutshell ";
 " Programming ASP.NET AJAX "; " C# et .NET ";
 " Application de getion de commande en asp.net ";
 " Programming .NET Components ";
 " RabbitMQ guide utilisateur du client .NET version 1.5.0 ";
 " Programming ASP.NET MVC 4 "; " C# et .NET Version 2 ";
 " Programming .NET 3.5 "; " Learning ASP.NET 2.0 with AJAX ";
 " The ASP.NET 2.0 Anthology "; " ASP.NET 2.0: A Developer";
 " Programming ASP.NET 3.5 "; " Build Your Own ASP.NET 4 Web Site Using C# "]
namespace System
namespace System.Net
namespace ScrapyFSharp
module Network

from ScrapyFSharp
val b1 : ScrapingBrowser

Full name: NetworkExample.b1
Multiple items
type ScrapingBrowser =
  new : unit -> ScrapingBrowser
  member private CreateRequest : url:Uri -> verb:HttpVerb -> HttpWebRequest
  member DownloadFile : url:Uri -> Async<Stream>
  member DownloadString : url:Uri -> Async<string>
  member private ExecuteRequest : url:Uri * verb:HttpVerb * data:HttpRequestData option * iteration:int -> Async<BrowserState>
  member private GetResponse : url:Uri -> request:HttpWebRequest -> iteration:int -> requestBody:byte array option -> Async<BrowserState>
  member GetWebResponse : url:Uri -> request:HttpWebRequest -> Async<HttpWebResponse>
  member NavigateTo : url:Uri * ?verb:HttpVerb * ?data:HttpRequestData -> Async<BrowserState>
  member SetCookies : cookieUrl:Uri -> exp:string -> unit
  member AllowAutoRedirect : bool
  ...

Full name: ScrapyFSharp.Network.ScrapingBrowser

--------------------
new : unit -> ScrapingBrowser
property ScrapingBrowser.UserAgent: FakeUserAgent
type FakeUserAgent =
  {Name: string;
   UserAgent: string;}
  static member Chrome : FakeUserAgent
  static member Chrome24 : FakeUserAgent
  static member InternetExplorer8 : FakeUserAgent

Full name: ScrapyFSharp.Network.FakeUserAgent
property FakeUserAgent.Chrome: FakeUserAgent
property ScrapingBrowser.AllowAutoRedirect: bool
property ScrapingBrowser.AllowMetaRedirect: bool
property ScrapingBrowser.UseDefaultCookiesParser: bool
property ScrapingBrowser.DecompressionMethod: DecompressionMethods option
union case Option.Some: Value: 'T -> Option<'T>
type DecompressionMethods =
  | None = 0
  | GZip = 1
  | Deflate = 2

Full name: System.Net.DecompressionMethods
field DecompressionMethods.GZip = 1
val b : ScrapingBrowser

Full name: NetworkExample.b
val browser : f:(BrowserConfig -> BrowserConfig) -> ScrapingBrowser

Full name: ScrapyFSharp.Network.browser
val c : BrowserConfig
property FakeUserAgent.InternetExplorer8: FakeUserAgent
val text1 : string

Full name: NetworkExample.text1
val async : AsyncBuilder

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.async
member ScrapingBrowser.DownloadString : url:Uri -> Async<string>
Multiple items
type Uri =
  new : uriString:string -> Uri + 5 overloads
  member AbsolutePath : string
  member AbsoluteUri : string
  member Authority : string
  member DnsSafeHost : string
  member Equals : comparand:obj -> bool
  member Fragment : string
  member GetComponents : components:UriComponents * format:UriFormat -> string
  member GetHashCode : unit -> int
  member GetLeftPart : part:UriPartial -> string
  ...

Full name: System.Uri

--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
Multiple items
type Async
static member AsBeginEnd : computation:('Arg -> Async<'T>) -> ('Arg * AsyncCallback * obj -> IAsyncResult) * (IAsyncResult -> 'T) * (IAsyncResult -> unit)
static member AwaitEvent : event:IEvent<'Del,'T> * ?cancelAction:(unit -> unit) -> Async<'T> (requires delegate and 'Del :> Delegate)
static member AwaitIAsyncResult : iar:IAsyncResult * ?millisecondsTimeout:int -> Async<bool>
static member AwaitTask : task:Task -> Async<unit>
static member AwaitTask : task:Task<'T> -> Async<'T>
static member AwaitWaitHandle : waitHandle:WaitHandle * ?millisecondsTimeout:int -> Async<bool>
static member CancelDefaultToken : unit -> unit
static member Catch : computation:Async<'T> -> Async<Choice<'T,exn>>
static member FromBeginEnd : beginAction:(AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg:'Arg1 * beginAction:('Arg1 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * beginAction:('Arg1 * 'Arg2 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * arg3:'Arg3 * beginAction:('Arg1 * 'Arg2 * 'Arg3 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromContinuations : callback:(('T -> unit) * (exn -> unit) * (OperationCanceledException -> unit) -> unit) -> Async<'T>
static member Ignore : computation:Async<'T> -> Async<unit>
static member OnCancel : interruption:(unit -> unit) -> Async<IDisposable>
static member Parallel : computations:seq<Async<'T>> -> Async<'T []>
static member RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:CancellationToken -> 'T
static member Sleep : millisecondsDueTime:int -> Async<unit>
static member Start : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions * ?cancellationToken:CancellationToken -> Task<'T>
static member StartChild : computation:Async<'T> * ?millisecondsTimeout:int -> Async<Async<'T>>
static member StartChildAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions -> Async<Task<'T>>
static member StartImmediate : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartWithContinuations : computation:Async<'T> * continuation:('T -> unit) * exceptionContinuation:(exn -> unit) * cancellationContinuation:(OperationCanceledException -> unit) * ?cancellationToken:CancellationToken -> unit
static member SwitchToContext : syncContext:SynchronizationContext -> Async<unit>
static member SwitchToNewThread : unit -> Async<unit>
static member SwitchToThreadPool : unit -> Async<unit>
static member TryCancelled : computation:Async<'T> * compensation:(OperationCanceledException -> unit) -> Async<'T>
static member CancellationToken : Async<CancellationToken>
static member DefaultCancellationToken : CancellationToken

Full name: Microsoft.FSharp.Control.Async

--------------------
type Async<'T>

Full name: Microsoft.FSharp.Control.Async<_>
static member Async.RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:Threading.CancellationToken -> 'T
val text1Truncated : string

Full name: NetworkExample.text1Truncated
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
Multiple items
namespace FSharp

--------------------
namespace Microsoft.FSharp
Multiple items
namespace FSharp.Data

--------------------
namespace Microsoft.FSharp.Data
module CssSelectorExtensions

from ScrapyFSharp
val nodeText : n:HtmlNode -> string

Full name: NetworkExample.nodeText
val n : HtmlNode
Multiple items
module HtmlNode

from FSharp.Data

--------------------
type HtmlNode =
  private | HtmlElement of name: string * attributes: HtmlAttribute list * elements: HtmlNode list
          | HtmlText of content: string
          | HtmlComment of content: string
          | HtmlCData of content: string
  override ToString : unit -> string
  static member NewComment : content:string -> HtmlNode
  static member NewElement : name:string -> HtmlNode
  static member NewElement : name:string * children:seq<HtmlNode> -> HtmlNode
  static member NewElement : name:string * attrs:seq<string * string> -> HtmlNode
  static member NewElement : name:string * attrs:seq<string * string> * children:seq<HtmlNode> -> HtmlNode
  static member NewText : content:string -> HtmlNode
  static member Parse : text:string -> HtmlNode list
  static member ParseRooted : rootName:string * text:string -> HtmlNode

Full name: FSharp.Data.HtmlNode
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * predicate:(HtmlNode -> bool) -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * names:seq<string> -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * name:string -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * predicate:(HtmlNode -> bool) * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * names:seq<string> * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * name:string * recurseOnMatch:bool -> seq<HtmlNode>
module Seq

from Microsoft.FSharp.Collections
val tryFind : predicate:('T -> bool) -> source:seq<'T> -> 'T option

Full name: Microsoft.FSharp.Collections.Seq.tryFind
val c : HtmlNode
static member HtmlNodeExtensions.Name : n:HtmlNode -> string
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
String.IsNullOrEmpty(value: string) : bool
val e : HtmlNode
static member HtmlNodeExtensions.InnerText : n:HtmlNode -> string
union case Option.None: Option<'T>
val books : string list

Full name: NetworkExample.books
val state1 : BrowserState
member ScrapingBrowser.NavigateTo : url:Uri * ?verb:HttpVerb * ?data:HttpRequestData -> Async<BrowserState>
union case HttpVerb.Get: HttpVerb
type HttpRequestData =
  | Text of string
  | Buffer of byte array
  | ReadableData of Stream
  | FormData of (string * string) list
  member ToRawParams : unit -> string
  static member FromBytes : b:byte array -> HttpRequestData
  static member FromFormData : f:(string * string) list -> HttpRequestData
  static member FromStream : s:Stream -> HttpRequestData
  static member FromString : s:string -> HttpRequestData

Full name: ScrapyFSharp.Network.HttpRequestData
union case HttpRequestData.FormData: (string * string) list -> HttpRequestData
val homePage : WebPage
member BrowserState.WebPage : ?autoDetectCharsetEncoding:bool -> WebPage
member WebPage.Html : unit -> HtmlDocument option
val html : HtmlDocument
val div : HtmlNode
static member CssSelectorExtensions.CssSelect : doc:HtmlDocument * selector:string -> HtmlNode list


 Gets descendants matched by Css selector
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member GetSlice : startIndex:int option * endIndex:int option -> 'T list
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val empty<'T> : 'T list

Full name: Microsoft.FSharp.Collections.List.empty
Fork me on GitHub