Simulate a browser
This module helps you navigate a website. Cookies, the Referer header, etc. are handled automatically.
Opening namespaces
1: 2: 3: |
|
Creating a browser
1:
|
|
Some fake user agents are available
1:
|
|
Enable handling of redirect headers
1:
|
|
Enable redirects via HTML meta tags such as 'meta http-equiv="refresh"'
1:
|
|
Sometimes the default .NET cookie parser can throw an exception when you are scraping old Java websites. In that case, you can use this workaround.
1:
|
|
If you are scraping GZIP compressed web pages, you can specify the decompression method.
1:
|
|
Another syntax to create a browser
1:
|
|
Simply download raw content as text
1: 2: 3: 4: |
|
text1 value is:
|
end ...
Simulate a form submission and parse the result
In this example, you can save the browser state after each page download. This can be useful if you plan to parallelize your scraping across multiple machines, pause a job, or implement a saga.
In the following example, we search for .NET books on Youscribe:
1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: |
|
books value is:
|
namespace System
namespace System.Net
namespace ScrapyFSharp
module Network
from ScrapyFSharp
from ScrapyFSharp
val b1 : ScrapingBrowser
Full name: NetworkExample.b1
Full name: NetworkExample.b1
Multiple items
type ScrapingBrowser =
new : unit -> ScrapingBrowser
member private CreateRequest : url:Uri -> verb:HttpVerb -> HttpWebRequest
member DownloadFile : url:Uri -> Async<Stream>
member DownloadString : url:Uri -> Async<string>
member private ExecuteRequest : url:Uri * verb:HttpVerb * data:HttpRequestData option * iteration:int -> Async<BrowserState>
member private GetResponse : url:Uri -> request:HttpWebRequest -> iteration:int -> requestBody:byte array option -> Async<BrowserState>
member GetWebResponse : url:Uri -> request:HttpWebRequest -> Async<HttpWebResponse>
member NavigateTo : url:Uri * ?verb:HttpVerb * ?data:HttpRequestData -> Async<BrowserState>
member SetCookies : cookieUrl:Uri -> exp:string -> unit
member AllowAutoRedirect : bool
...
Full name: ScrapyFSharp.Network.ScrapingBrowser
--------------------
new : unit -> ScrapingBrowser
type ScrapingBrowser =
new : unit -> ScrapingBrowser
member private CreateRequest : url:Uri -> verb:HttpVerb -> HttpWebRequest
member DownloadFile : url:Uri -> Async<Stream>
member DownloadString : url:Uri -> Async<string>
member private ExecuteRequest : url:Uri * verb:HttpVerb * data:HttpRequestData option * iteration:int -> Async<BrowserState>
member private GetResponse : url:Uri -> request:HttpWebRequest -> iteration:int -> requestBody:byte array option -> Async<BrowserState>
member GetWebResponse : url:Uri -> request:HttpWebRequest -> Async<HttpWebResponse>
member NavigateTo : url:Uri * ?verb:HttpVerb * ?data:HttpRequestData -> Async<BrowserState>
member SetCookies : cookieUrl:Uri -> exp:string -> unit
member AllowAutoRedirect : bool
...
Full name: ScrapyFSharp.Network.ScrapingBrowser
--------------------
new : unit -> ScrapingBrowser
property ScrapingBrowser.UserAgent: FakeUserAgent
type FakeUserAgent =
{Name: string;
UserAgent: string;}
static member Chrome : FakeUserAgent
static member Chrome24 : FakeUserAgent
static member InternetExplorer8 : FakeUserAgent
Full name: ScrapyFSharp.Network.FakeUserAgent
{Name: string;
UserAgent: string;}
static member Chrome : FakeUserAgent
static member Chrome24 : FakeUserAgent
static member InternetExplorer8 : FakeUserAgent
Full name: ScrapyFSharp.Network.FakeUserAgent
property FakeUserAgent.Chrome: FakeUserAgent
property ScrapingBrowser.AllowAutoRedirect: bool
property ScrapingBrowser.AllowMetaRedirect: bool
property ScrapingBrowser.UseDefaultCookiesParser: bool
property ScrapingBrowser.DecompressionMethod: DecompressionMethods option
union case Option.Some: Value: 'T -> Option<'T>
type DecompressionMethods =
| None = 0
| GZip = 1
| Deflate = 2
Full name: System.Net.DecompressionMethods
| None = 0
| GZip = 1
| Deflate = 2
Full name: System.Net.DecompressionMethods
field DecompressionMethods.GZip = 1
val b : ScrapingBrowser
Full name: NetworkExample.b
Full name: NetworkExample.b
val browser : f:(BrowserConfig -> BrowserConfig) -> ScrapingBrowser
Full name: ScrapyFSharp.Network.browser
Full name: ScrapyFSharp.Network.browser
val c : BrowserConfig
property FakeUserAgent.InternetExplorer8: FakeUserAgent
val text1 : string
Full name: NetworkExample.text1
Full name: NetworkExample.text1
val async : AsyncBuilder
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.async
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.async
member ScrapingBrowser.DownloadString : url:Uri -> Async<string>
Multiple items
type Uri =
new : uriString:string -> Uri + 5 overloads
member AbsolutePath : string
member AbsoluteUri : string
member Authority : string
member DnsSafeHost : string
member Equals : comparand:obj -> bool
member Fragment : string
member GetComponents : components:UriComponents * format:UriFormat -> string
member GetHashCode : unit -> int
member GetLeftPart : part:UriPartial -> string
...
Full name: System.Uri
--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
type Uri =
new : uriString:string -> Uri + 5 overloads
member AbsolutePath : string
member AbsoluteUri : string
member Authority : string
member DnsSafeHost : string
member Equals : comparand:obj -> bool
member Fragment : string
member GetComponents : components:UriComponents * format:UriFormat -> string
member GetHashCode : unit -> int
member GetLeftPart : part:UriPartial -> string
...
Full name: System.Uri
--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
Multiple items
type Async
static member AsBeginEnd : computation:('Arg -> Async<'T>) -> ('Arg * AsyncCallback * obj -> IAsyncResult) * (IAsyncResult -> 'T) * (IAsyncResult -> unit)
static member AwaitEvent : event:IEvent<'Del,'T> * ?cancelAction:(unit -> unit) -> Async<'T> (requires delegate and 'Del :> Delegate)
static member AwaitIAsyncResult : iar:IAsyncResult * ?millisecondsTimeout:int -> Async<bool>
static member AwaitTask : task:Task -> Async<unit>
static member AwaitTask : task:Task<'T> -> Async<'T>
static member AwaitWaitHandle : waitHandle:WaitHandle * ?millisecondsTimeout:int -> Async<bool>
static member CancelDefaultToken : unit -> unit
static member Catch : computation:Async<'T> -> Async<Choice<'T,exn>>
static member FromBeginEnd : beginAction:(AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg:'Arg1 * beginAction:('Arg1 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * beginAction:('Arg1 * 'Arg2 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * arg3:'Arg3 * beginAction:('Arg1 * 'Arg2 * 'Arg3 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromContinuations : callback:(('T -> unit) * (exn -> unit) * (OperationCanceledException -> unit) -> unit) -> Async<'T>
static member Ignore : computation:Async<'T> -> Async<unit>
static member OnCancel : interruption:(unit -> unit) -> Async<IDisposable>
static member Parallel : computations:seq<Async<'T>> -> Async<'T []>
static member RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:CancellationToken -> 'T
static member Sleep : millisecondsDueTime:int -> Async<unit>
static member Start : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions * ?cancellationToken:CancellationToken -> Task<'T>
static member StartChild : computation:Async<'T> * ?millisecondsTimeout:int -> Async<Async<'T>>
static member StartChildAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions -> Async<Task<'T>>
static member StartImmediate : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartWithContinuations : computation:Async<'T> * continuation:('T -> unit) * exceptionContinuation:(exn -> unit) * cancellationContinuation:(OperationCanceledException -> unit) * ?cancellationToken:CancellationToken -> unit
static member SwitchToContext : syncContext:SynchronizationContext -> Async<unit>
static member SwitchToNewThread : unit -> Async<unit>
static member SwitchToThreadPool : unit -> Async<unit>
static member TryCancelled : computation:Async<'T> * compensation:(OperationCanceledException -> unit) -> Async<'T>
static member CancellationToken : Async<CancellationToken>
static member DefaultCancellationToken : CancellationToken
Full name: Microsoft.FSharp.Control.Async
--------------------
type Async<'T>
Full name: Microsoft.FSharp.Control.Async<_>
type Async
static member AsBeginEnd : computation:('Arg -> Async<'T>) -> ('Arg * AsyncCallback * obj -> IAsyncResult) * (IAsyncResult -> 'T) * (IAsyncResult -> unit)
static member AwaitEvent : event:IEvent<'Del,'T> * ?cancelAction:(unit -> unit) -> Async<'T> (requires delegate and 'Del :> Delegate)
static member AwaitIAsyncResult : iar:IAsyncResult * ?millisecondsTimeout:int -> Async<bool>
static member AwaitTask : task:Task -> Async<unit>
static member AwaitTask : task:Task<'T> -> Async<'T>
static member AwaitWaitHandle : waitHandle:WaitHandle * ?millisecondsTimeout:int -> Async<bool>
static member CancelDefaultToken : unit -> unit
static member Catch : computation:Async<'T> -> Async<Choice<'T,exn>>
static member FromBeginEnd : beginAction:(AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg:'Arg1 * beginAction:('Arg1 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * beginAction:('Arg1 * 'Arg2 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * arg3:'Arg3 * beginAction:('Arg1 * 'Arg2 * 'Arg3 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromContinuations : callback:(('T -> unit) * (exn -> unit) * (OperationCanceledException -> unit) -> unit) -> Async<'T>
static member Ignore : computation:Async<'T> -> Async<unit>
static member OnCancel : interruption:(unit -> unit) -> Async<IDisposable>
static member Parallel : computations:seq<Async<'T>> -> Async<'T []>
static member RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:CancellationToken -> 'T
static member Sleep : millisecondsDueTime:int -> Async<unit>
static member Start : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions * ?cancellationToken:CancellationToken -> Task<'T>
static member StartChild : computation:Async<'T> * ?millisecondsTimeout:int -> Async<Async<'T>>
static member StartChildAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions -> Async<Task<'T>>
static member StartImmediate : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartWithContinuations : computation:Async<'T> * continuation:('T -> unit) * exceptionContinuation:(exn -> unit) * cancellationContinuation:(OperationCanceledException -> unit) * ?cancellationToken:CancellationToken -> unit
static member SwitchToContext : syncContext:SynchronizationContext -> Async<unit>
static member SwitchToNewThread : unit -> Async<unit>
static member SwitchToThreadPool : unit -> Async<unit>
static member TryCancelled : computation:Async<'T> * compensation:(OperationCanceledException -> unit) -> Async<'T>
static member CancellationToken : Async<CancellationToken>
static member DefaultCancellationToken : CancellationToken
Full name: Microsoft.FSharp.Control.Async
--------------------
type Async<'T>
Full name: Microsoft.FSharp.Control.Async<_>
static member Async.RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:Threading.CancellationToken -> 'T
val text1Truncated : string
Full name: NetworkExample.text1Truncated
Full name: NetworkExample.text1Truncated
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
String.Substring(startIndex: int, length: int) : string
Multiple items
namespace FSharp
--------------------
namespace Microsoft.FSharp
namespace FSharp
--------------------
namespace Microsoft.FSharp
Multiple items
namespace FSharp.Data
--------------------
namespace Microsoft.FSharp.Data
namespace FSharp.Data
--------------------
namespace Microsoft.FSharp.Data
module CssSelectorExtensions
from ScrapyFSharp
from ScrapyFSharp
val nodeText : n:HtmlNode -> string
Full name: NetworkExample.nodeText
Full name: NetworkExample.nodeText
val n : HtmlNode
Multiple items
module HtmlNode
from FSharp.Data
--------------------
type HtmlNode =
private | HtmlElement of name: string * attributes: HtmlAttribute list * elements: HtmlNode list
| HtmlText of content: string
| HtmlComment of content: string
| HtmlCData of content: string
override ToString : unit -> string
static member NewComment : content:string -> HtmlNode
static member NewElement : name:string -> HtmlNode
static member NewElement : name:string * children:seq<HtmlNode> -> HtmlNode
static member NewElement : name:string * attrs:seq<string * string> -> HtmlNode
static member NewElement : name:string * attrs:seq<string * string> * children:seq<HtmlNode> -> HtmlNode
static member NewText : content:string -> HtmlNode
static member Parse : text:string -> HtmlNode list
static member ParseRooted : rootName:string * text:string -> HtmlNode
Full name: FSharp.Data.HtmlNode
module HtmlNode
from FSharp.Data
--------------------
type HtmlNode =
private | HtmlElement of name: string * attributes: HtmlAttribute list * elements: HtmlNode list
| HtmlText of content: string
| HtmlComment of content: string
| HtmlCData of content: string
override ToString : unit -> string
static member NewComment : content:string -> HtmlNode
static member NewElement : name:string -> HtmlNode
static member NewElement : name:string * children:seq<HtmlNode> -> HtmlNode
static member NewElement : name:string * attrs:seq<string * string> -> HtmlNode
static member NewElement : name:string * attrs:seq<string * string> * children:seq<HtmlNode> -> HtmlNode
static member NewText : content:string -> HtmlNode
static member Parse : text:string -> HtmlNode list
static member ParseRooted : rootName:string * text:string -> HtmlNode
Full name: FSharp.Data.HtmlNode
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * predicate:(HtmlNode -> bool) -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * names:seq<string> -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * name:string -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * predicate:(HtmlNode -> bool) * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * names:seq<string> * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * name:string * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * predicate:(HtmlNode -> bool) -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * names:seq<string> -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * name:string -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * predicate:(HtmlNode -> bool) * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * names:seq<string> * recurseOnMatch:bool -> seq<HtmlNode>
static member HtmlNodeExtensions.DescendantsAndSelf : n:HtmlNode * name:string * recurseOnMatch:bool -> seq<HtmlNode>
module Seq
from Microsoft.FSharp.Collections
from Microsoft.FSharp.Collections
val tryFind : predicate:('T -> bool) -> source:seq<'T> -> 'T option
Full name: Microsoft.FSharp.Collections.Seq.tryFind
Full name: Microsoft.FSharp.Collections.Seq.tryFind
val c : HtmlNode
static member HtmlNodeExtensions.Name : n:HtmlNode -> string
Multiple items
type String =
new : value:char -> string + 7 overloads
member Chars : int -> char
member Clone : unit -> obj
member CompareTo : value:obj -> int + 1 overload
member Contains : value:string -> bool
member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
member EndsWith : value:string -> bool + 2 overloads
member Equals : obj:obj -> bool + 2 overloads
member GetEnumerator : unit -> CharEnumerator
member GetHashCode : unit -> int
...
Full name: System.String
--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
type String =
new : value:char -> string + 7 overloads
member Chars : int -> char
member Clone : unit -> obj
member CompareTo : value:obj -> int + 1 overload
member Contains : value:string -> bool
member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
member EndsWith : value:string -> bool + 2 overloads
member Equals : obj:obj -> bool + 2 overloads
member GetEnumerator : unit -> CharEnumerator
member GetHashCode : unit -> int
...
Full name: System.String
--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
String.IsNullOrEmpty(value: string) : bool
val e : HtmlNode
static member HtmlNodeExtensions.InnerText : n:HtmlNode -> string
union case Option.None: Option<'T>
val books : string list
Full name: NetworkExample.books
Full name: NetworkExample.books
val state1 : BrowserState
member ScrapingBrowser.NavigateTo : url:Uri * ?verb:HttpVerb * ?data:HttpRequestData -> Async<BrowserState>
union case HttpVerb.Get: HttpVerb
type HttpRequestData =
| Text of string
| Buffer of byte array
| ReadableData of Stream
| FormData of (string * string) list
member ToRawParams : unit -> string
static member FromBytes : b:byte array -> HttpRequestData
static member FromFormData : f:(string * string) list -> HttpRequestData
static member FromStream : s:Stream -> HttpRequestData
static member FromString : s:string -> HttpRequestData
Full name: ScrapyFSharp.Network.HttpRequestData
| Text of string
| Buffer of byte array
| ReadableData of Stream
| FormData of (string * string) list
member ToRawParams : unit -> string
static member FromBytes : b:byte array -> HttpRequestData
static member FromFormData : f:(string * string) list -> HttpRequestData
static member FromStream : s:Stream -> HttpRequestData
static member FromString : s:string -> HttpRequestData
Full name: ScrapyFSharp.Network.HttpRequestData
union case HttpRequestData.FormData: (string * string) list -> HttpRequestData
val homePage : WebPage
member BrowserState.WebPage : ?autoDetectCharsetEncoding:bool -> WebPage
member WebPage.Html : unit -> HtmlDocument option
val html : HtmlDocument
val div : HtmlNode
static member CssSelectorExtensions.CssSelect : doc:HtmlDocument * selector:string -> HtmlNode list
Gets descendants matched by Css selector
Gets descendants matched by Css selector
Multiple items
module List
from Microsoft.FSharp.Collections
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member GetSlice : startIndex:int option * endIndex:int option -> 'T list
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
module List
from Microsoft.FSharp.Collections
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member GetSlice : startIndex:int option * endIndex:int option -> 'T list
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
val empty<'T> : 'T list
Full name: Microsoft.FSharp.Collections.List.empty
Full name: Microsoft.FSharp.Collections.List.empty