从今天开始写larbin的源码分析。希望一个月后,自己可以对larbin的源码了如指掌,也希望到那时自己能够写出一个爬虫。
下面是global结构体的定义。从global的成员中,我们可以对larbin有一个大致的了解。
/**
 * Shared state of the larbin crawler.
 *
 * Every data member is static, so the struct acts as a namespace for the
 * program-wide queues, site tables, connection pool, configuration values
 * and poll() bookkeeping.
 */
struct global {
    /** Constructor : see global.cc for details */
    global (int argc, char * argv[]);
    /** Destructor : never used */
    ~global ();

    /** Current time : avoids too many calls to time(NULL) */
    static time_t now;
    /** List of pages already seen (one bit per page) */
    static hashTable *seen;
#ifdef NO_DUP
    /** Hashtable for suppressing duplicates */
    static hashDup *hDuplicate;
#endif // NO_DUP

    /** URLs for the sequencer with high priority.
     * SyncFifo is a synchronized first-in-first-out queue.
     */
    static SyncFifo<url> *URLsPriority;
    static SyncFifo<url> *URLsPriorityWait;
    static uint readPriorityWait;
    /** This one has a lower priority : see fetch/sequencer.cc.
     * PersistentFifo is a queue stored on disk.
     */
    static PersistentFifo *URLsDisk;
    static PersistentFifo *URLsDiskWait;
    static uint readWait;

    /** Hashtables of the sites we accessed (cache).
     * NamedSite stores the sites that have already been visited; each
     * NamedSite corresponds to one IPSite.
     */
    static NamedSite *namedSiteList;
    static IPSite *IPSiteList;
    /** Sites which have at least one url to fetch.
     * Fifo is a standard, non-synchronized queue held in RAM.
     */
    static Fifo<IPSite> *okSites;
    /** Sites which have at least one url to fetch
     * but need a dns call
     */
    static Fifo<NamedSite> *dnsSites;

    /** Information for the fetch.
     * This array contains all the connections (empty or not)
     */
    static Connexion *connexions;
    /** Internal state of adns */
    static adns_state ads;
    /** Number of pending dns calls */
    static uint nbDnsCalls;
    /** Free connections for fetchOpen : connections with state==EMPTY */
    static ConstantSizedFifo<Connexion> *freeConns;
#ifdef THREAD_OUTPUT
    /** Free connections for fetchOpen : connections waiting for end user */
    static ConstantSizedFifo<Connexion> *userConns;
#endif
    /** Sum of the sizes of a fifo in Sites */
    static Interval *inter;

    /** How deep should we go inside a site */
    static int8_t depthInSite;
    /** Follow external links ? */
    static bool externalLinks;
    /** How many seconds should we wait between 2 calls to the same server :
     * 0 if you are only on a personal server, >=30 otherwise
     */
    static time_t waitDuration;
    /** Name of the bot */
    static char *userAgent;
    /** Name of the person who launched the bot */
    static char *sender;
    /** http headers to send with requests :
     * sends name of the robot, from field...
     */
    static char *headers;
    /** Headers used when asking for a robots.txt */
    static char *headersRobots;
    /** Internet address of the proxy (if any) */
    static sockaddr_in *proxyAddr;
    /** Connect to this server through a proxy using connection conn.
     * Returns >0 in case of success (connecting or connected), 0 otherwise
     */
    static char getProxyFds (Connexion *conn);
    /** Limit to domain */
    static Vector<char> *domains;
    /** Forbidden extensions :
     * extensions which are always to avoid : .ps, .pdf...
     */
    static Vector<char> forbExt;
    /** Number of parallel connexions :
     * your kernel must support a little more than nb_conn file descriptors
     */
    static uint nb_conn;
    /** Number of parallel dns calls */
    static uint dnsConn;
    /** Number of urls in IPSites */
    static int IPUrl;
    /** Port on which the http statistic webserver is launched */
    static unsigned short int httpPort;
    /** Port on which input waits for queries */
    static unsigned short int inputPort;

    /** Parse configuration file */
    static void parseFile (char *file);
    /** Read the domain limit */
    static void manageDomain (char **posParse);
    /** Read the forbidden extensions */
    static void manageExt (char **posParse);

    /////////// POLL ///////////////////////////////////
    /** Array used by poll */
    static struct pollfd *pollfds;
    /** Pos of the max used field in pollfds */
    static uint posPoll;
    /** Size of pollfds */
    static uint sizePoll;
    /** Array used for dealing with answers */
    static short *ansPoll;
    /** Number of the biggest file descriptor */
    static int maxFds;
    /** Make sure the new socket is not too big for ansPoll */
    static void verifMax (int fd);
#ifdef MAXBANDWIDTH
    /** Number of bits still allowed during this second */
    static long int remainBand;
#endif // MAXBANDWIDTH
};