



版權說明:本文檔由用戶提供并上傳,收益歸屬內容提供方,若內容存在侵權,請進行舉報或認領
文檔簡介
1、用 C 語言編寫一個網絡蜘蛛來搜索網上出現的電子郵件地址作者: zhoulifa來源:可能大家經常要去互聯網上搜索特定的內容,比如收集大量郵件地址,如果用google之類的搜索引擎是沒法實現這種特定功能的,所以用C 語言來寫一個吧。它的功能就是不斷去取得網絡上的頁面, 然后分析出網頁上出現的郵件地址保存下來。 象個蜘蛛一樣, 從網絡上一個網頁爬向另一個網頁,不停止地搜索郵件地址。即:分析程序運行時的參數, 把各網頁地址作為根節點加入到鏈表, 然后從鏈表頭開始處理各節點。對整個鏈表的處理是先處理兄弟節點,流程圖如下:然后再處理各節點的子節點,流程圖如下:當然,這里采用了遞歸調用方法,處理子節點的數
2、據(jù)時(shí)和處理整個(gè)鏈表一樣循環(huán)處理就是了。/*關(guān)于本文檔 *filename:用 C 語言編寫一個(gè)網(wǎng)絡(luò)蜘蛛來搜索網(wǎng)上出現(xiàn)的電子郵件地址*purpose: 一個(gè)郵址搜索程序的雛形*wrote by)周立發(fā)愛好者Linux 知識(shí)傳播者SOHO 族 開發(fā)者最擅長 C 語言*date time:2006-08-31 21:00:00*Note:任何人可以任意復(fù)制代碼并運(yùn)用這些文檔,當(dāng)然包括你的商業(yè)用途* 但請(qǐng)遵循 GPL*Hope: 希望越來越多的人貢獻(xiàn)自己的力量,為科學(xué)技術(shù)發(fā)展出力*/程序在運(yùn)行的過程中要建立一個(gè)樹形鏈表結(jié)構(gòu),結(jié)構(gòu)圖如下:程序啟動(dòng)時(shí)分析所帶參數(shù), 把各參數(shù)加入到根網(wǎng)頁節(jié)點(diǎn), 如果有多個(gè)參
3、數(shù)則這個(gè)根網(wǎng)頁有兄弟節(jié)點(diǎn)。然后從根節(jié)點(diǎn)開始處理這一級(jí)上各節(jié)點(diǎn), 把各節(jié)點(diǎn)網(wǎng)頁上出現(xiàn)的網(wǎng)頁鏈接加到該節(jié)點(diǎn)的子節(jié)點(diǎn)上,處理完當(dāng)前這一級(jí)后處理子節(jié)點(diǎn)這一級(jí)。當(dāng)然這只是一個(gè)原理展示程序,并沒有進(jìn)行優(yōu)化。這個(gè)程序的main 函數(shù)流程圖如下:源代碼如下:#include <sys/>#include <sys/>#include <>#include <sys/>#include <>#include <>#include <>#include <>#include <>#include <&
4、gt;#include <>#define ACCEPT "*/*"#define ACCEPTLANGUAGE "zh-cn,zh;q="#define ACCEPTENCODING "gzip,deflate"#define ACCEPTCHARSET "gb2312,utf-8;q=,*;q="#define KEEPALIVE "300"#define CONNECTION "keep-alive"#define CONTENTTYPE "app
5、lication/x-www-form-urlencoded"#define MAXFILENAME 14#define DEBUG 1typedef struct webnode char * host;/*網(wǎng)頁所在的主機(jī)*/int port;/*網(wǎng)絡(luò)服務(wù)器所使用的端口*/char * dir;/*網(wǎng)頁所在的目錄*/char * page;/*網(wǎng)頁文件名*/char * file;/*本地保存的文件名*/char IsHandled;/*是否處理過*/struct webnode * brother;/* 兄弟節(jié)點(diǎn)鏈表指針*/struct webnode * child;/* 子節(jié)
6、點(diǎn)鏈表指針*/ WEBNODE;struct sockaddr_in server_addr;int sockfd = 0, dsend = 0, totalsend = 0, nbytes = 0, reqn = 0, i = 0, j = 0, ret = 0; struct hostent *host;char request409600 = "", buffer1024 = "", httpheader1024 = "" int FileNumber = 0;char e2 = "/"WEBNODE * N
7、odeHeader, * NodeTail, * NodeCurr;char * mapped_mem;int GetHost(char * , char * , char * , int * , char * ); /*/void AnalyzePage(WEBNODE *); /*/void AddInitNode(char *, char *, int, char * ); /*/void HandleInitNode(WEBNODE *); /*/void DisplayNode(WEBNODE *); /*/void HandOneNode(WEBNODE *); /*/void D
8、oneWithList(int); /*/void DoOnce(); /*/void ConnectWeb(void); /*/void SendRequest(void); /*/void ReceiveResponse(void); /*/void GetEmail(char * ); /*/void GetLink(char * ); /*/void GetBeforePos(char * , char * ); /*/void GetAfterPos(char * , char * ); /*/void AddChildNode(WEBNODE * , char * ); /*/vo
9、id GetAfterPosWithSlash(char * , char * ); /*/void GetMemory(char * , int ); /*/int IsExistWeb(WEBNODE * , char * , char * , int , char * ); /*/ void Rstrchr(char * , int , char * ); /*/int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptChar
10、set, char * KeepAlive, char * Connection, char * ContentType); /*/*功能:設(shè)置HTTP 協(xié)議頭內(nèi)容的一些固定值*/int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType)memcpy(UserAgent, USERAGENT, strlen
11、(USERAGENT);memcpy(Accept, ACCEPT, strlen(ACCEPT);memcpy(AcceptLanguage, ACCEPTLANGUAGE, strlen(ACCEPTLANGUAGE); memcpy(AcceptEncoding, ACCEPTENCODING , strlen(ACCEPTENCODING); memcpy(AcceptCharset, ACCEPTCHARSET, strlen(ACCEPTCHARSET); memcpy(KeepAlive, KEEPALIVE, strlen(KEEPALIVE); memcpy(Connecti
12、on, CONNECTION, strlen(CONNECTION); memcpy(ContentType, CONTENTTYPE, strlen(CONTENTTYPE); return 0;/*功能:在字符串s 里搜索x 字符,并設(shè)置指針d 指向該位置*/void Rstrchr(char * s, int x, char * d)int len = strlen(s) - 1;while(len >= 0)if(x = slen) (*d) = s + len; return;len-;(*d) = 0;/*功能:連接一個(gè)網(wǎng)站服務(wù)器*/void ConnectWeb(void)
13、 /* connect to web server */* create a socket descriptor */if(sockfd=socket(PF_INET,SOCK_STREAM,0)=-1)fprintf(stderr," Socket Error:%sa ",strerror(errno);exit(1);/* bind address */bzero(&server_addr, sizeof(server_addr);= AF_INET;= htons(NodeCurr->port);= *(struct in_addr *)host->
14、;h_addr);/* connect to the server */if(connect(sockfd, (struct sockaddr *)(&server_addr), sizeof(struct sockaddr) = -1)fprintf(stderr, " Connect Error:%sa ", strerror(errno);exit(1);/*功能:向網(wǎng)站發(fā)送HTTP 請(qǐng)求*/void SendRequest(void) /* send my http-request to web server */ dsend = 0;totalsend =
15、 0;nbytes=strlen(request);while(totalsend < nbytes) dsend = write(sockfd, request + totalsend, nbytes - totalsend); if(dsend=-1) fprintf(stderr, " send error!%s ", strerror(errno);exit(0); totalsend+=dsend;fprintf(stdout, " Request.%d %d bytes send OK! ", reqn, totalsend);/*功能
16、:接收網(wǎng)站的HTTP 返回*/void ReceiveResponse(void) /* get response from web server */ fd_set writefds;struct timeval tival;int retry = 0;FILE * localfp = NULL;i=0; j = 0;_ReCeive:FD_ZERO(&writefds);= 10;= 0;if(sockfd > 0) FD_SET(sockfd, &writefds);else fprintf(stderr, " Error, socket is negat
17、ive! "); exit(0);ret = select(sockfd + 1, &writefds, NULL, NULL, &tival);if(ret =0 ) if(retry+ < 10) goto _ReCeive;if(ret <= 0) fprintf(stderr, " Error while receiving! "); exit(0);if(FD_ISSET(sockfd, &writefds) memset(buffer, 0, 1024); memset(httpheader, 0, 1024);if
18、(localfp = fopen(NodeCurr->file, "w") = NULL) if(DEBUG) fprintf(stderr, "create file '%s' error ", NodeCurr->file); return;/* receive data from web server */while(nbytes=read(sockfd,buffer,1)=1)if(i < 4) /*獲取HTTP 消息頭*/if(buffer0 = ' ' | buffer0 = '
19、39;)i+;else i = 0;memcpy(httpheader + j, buffer, 1); j+;else /*獲取HTTP 消息體*/fprintf(localfp, "%c", buffer0); /* print content on the screen */ d is: %s", +reqn, request);DoOnce();if(flag) fprintf(stdout, " The following is the response header: %s", httpheader);/*功能:從字符串src 中分
20、析出網(wǎng)站地址和端口,并得到文件和目錄*/int GetHost(char * src, char * web, char * file, int * port, char * dir) char * pA, * pB, * pC;int len;*port = 0;if(!(*src)return -1;pA = src;if(!strncmp(pA, "", strlen("")pA = src+strlen("");/* else if(!strncmp(pA, "", strlen("")
21、pA = src+strlen(""); */else return 1;pB = strchr(pA, '/');if(pB)len = strlen(pA) - strlen(pB);GetMemory(web, len);memcpy(*web), pA, len);if(*(pB+1)Rstrchr(pB + 1, '/', &pC);if(pC) len = strlen(pB + 1) - strlen(pC);else len = 0;if(len > 0) GetMemory(dir, len);memcpy(*
22、dir), pB + 1, len);if(pC + 1) len = strlen(pC + 1);GetMemory(file, len);memcpy(*file), pC + 1, len);else len = 1;GetMemory(file, len);memcpy(*file), e, len);else len = 1;GetMemory(dir, len);memcpy(*dir), e + 1, len);len = strlen(pB + 1);GetMemory(file, len);memcpy(*file), pB + 1, len);else len = 1;G
23、etMemory(dir, len);memcpy(*dir), e + 1, len);len = 1;GetMemory(file, len);memcpy(*file), e, len);elselen = strlen(pA);GetMemory(web, len);memcpy(*web), pA, strlen(pA);len = 1;GetMemory(dir, len);memcpy(*dir), e + 1, len);len = 1;GetMemory(file, len);memcpy(*file), e, len);pA = strchr(*web), ':
24、39;);if(pA)*port = atoi(pA + 1);else *port = 80;return 0;/*filename:*purpose: 用 C 語言編寫一個(gè)網(wǎng)絡(luò)蜘蛛來搜索網(wǎng)上出現(xiàn)的電子郵件地址*tidied by)周立發(fā)愛好者Linux 知識(shí)傳播者SOHO 族 開發(fā)者最擅長 C 語言*date time:2006-08-31 21:00:00*Note:任何人可以任意復(fù)制代碼并運(yùn)用這些文檔,當(dāng)然包括你的商業(yè)用途* 但請(qǐng)遵循 GPL*Thanks to: 廣東省 Linux 公共服務(wù)技術(shù)支持中心 */int main(int argc, char * argv)int Web
25、Port;char * WebHost = 0, * PageAddress = 0, * WebDir = 0;if(argc < 2) if(DEBUG) fprintf(stdout, "Command error, you should input like this: %s WebPageAddress1 WebPageAddress2 WebPageAddress3 .", argv0); exit(0);NodeHeader = NodeTail = NodeCurr = 0;5d:", FileNumber);DisplayNode(Node
26、Header); /* display every node */HandleInitNode(NodeHeader); /* handle every page */return 0;/*功能:分析網(wǎng)頁*/void AnalyzePage(WEBNODE * node)int fd;int flength = 0;fd = open(node->file, O_RDONL Y);if(fd = -1)goto _AnalyzeDone;flength = lseek(fd, 1, SEEK_END);write(fd, "0", 1);lseek(fd, 0, SE
27、EK_SET);mapped_mem = mmap(0, flength, PROT_READ, MAP_PRIVA TE, fd, 0); GetEmail(mapped_mem);GetLink(mapped_mem);close(fd);munmap(mapped_mem, flength);_AnalyzeDone:close(fd);node->IsHandled = 1;remove(node->file);/*功能:為根節(jié)點(diǎn)設(shè)置兄弟節(jié)點(diǎn)*/void AddInitNode(char * Host, char * Page, int Port, char * Dir)W
28、EBNODE * NewNode;char filenameMAXFILENAME + 1 = ""if(NodeHeader = NULL) NewNode = NodeHeader = (WEBNODE *)malloc(sizeof(WEBNODE);else NodeTail->brother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE); memset(NewNode, 0, sizeof(WEBNODE);NewNode->host = (char *)malloc(strlen(Host) + 1);m
29、emset(NewNode->host, 0, strlen(Host) + 1);NewNode->page = (char *)malloc(strlen(Page) + 1);memset(NewNode->page, 0, strlen(Page) + 1);NewNode->dir = (char *)malloc(strlen(Dir) + 1);memset(NewNode->dir, 0, strlen(Dir) + 1);NewNode->file = (char *)malloc(MAXFILENAME + 1);memset(NewNo
30、de->file, 0, MAXFILENAME + 1);strcpy(NewNode->host, Host);strcpy(NewNode->page, Page);strcpy(NewNode->dir, Dir);sprintf(filename, "file%", FileNumber+);strcpy(NewNode->file, filename);NewNode->port = Port;NewNode->IsHandled = 0;NewNode->brother = 0;NewNode->child
31、 = 0;NodeTail = NewNode;/*功能:處理根節(jié)點(diǎn)信息*/void HandleInitNode(WEBNODE * node)WEBNODE * CurrentNode = 0;CurrentNode = node;if(CurrentNode)while(CurrentNode)if(CurrentNode->IsHandled = 0)HandOneNode(CurrentNode);if(DEBUG)fprintf(stdout, " Display.%5d:", FileNumber);DisplayNode(NodeHeader);/*d
32、isplayeverynode */CurrentNode = CurrentNode->brother;CurrentNode = node;while(CurrentNode)if(CurrentNode->child&&CurrentNode->child->IsHandled= 0)HandleInitNode(CurrentNode->child);CurrentNode = CurrentNode->brother;/*功能:顯示年有節(jié)點(diǎn)信息*/void DisplayNode(WEBNODE * NodeHeader)WEBNO
33、DE * TempNode;TempNode = NodeHeader;fprintf(stdout, " ");while(TempNode) if(!strcmp(TempNode->dir, "/") fprintf(stdout, " %s:%d%s%s => %s %d ", TempNode->host, TempNode->port, TempNode->dir, strcmp(TempNode->page,"")?TempNode->page:"&
34、quot;, TempNode->file, TempNode->IsHandled);else fprintf(stdout, " %s:%d/%s/%s => %s %d ", TempNode->host, TempNode->port, TempNode->dir, strcmp(TempNode->page, "")?TempNode->page:"", TempNode->file, TempNode->IsHandled);TempNode = TempNode
35、->brother;TempNode = NodeHeader;while(TempNode) if(TempNode->child) DisplayNode(TempNode->child); TempNode = TempNode->brother;/*功能:處理單個(gè)節(jié)點(diǎn)信息*/void HandOneNode(WEBNODE * node)char UserAgent1024 = "", Accept1024 = "", AcceptLanguage1024 = "", AcceptEncoding102
36、4 = "", AcceptCharset1024 = "", KeepAlive1024 = "", Connection1024 = "", ContentType1024 = ""NodeCurr = node;if(host=gethostbyname(NodeCurr->host)=NULL) /* get ip address by domain */if(DEBUG)fprintf(stderr,"Gethostname '%s' error, %s
37、 ", NodeCurr->host,strerror(errno);exit(1);GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType); /* Get client browser information */if(strcmp(NodeCurr->dir, "/")sprintf(request, "GET /%s/%s HTTP/ Host: %sUser-A
38、gent:%sAccept:%sConnection:%s",NodeCurr->dir,strcmp(NodeCurr->page,"")?NodeCurr->page:"", NodeCurr->host, UserAgent, Accept, Connection);elsesprintf(request, "GET %s%s HTTP/ Host: %s User-Agent: %s Accept: %sConnection:%s",NodeCurr->dir,strcmp(NodeCu
39、rr->page,"")?NodeCurr->page:"",NodeCurr->host, UserAgent, Accept, Connection);DoneWithList(1);AnalyzePage(NodeCurr);/*功能:從字符串src 中分析出郵件地址保存到文件*/void GetEmail(char * src)char * pa, * pb, * pc, *pd;char myemail1024 = ""FILE * mailfp = NULL;if(mailfp = fopen("
40、;", "a+") = NULL)return;pa = src;while(pb = strchr(pa, '')GetBeforePos(pb, &pc);GetAfterPos(pb, &pd);if(pc && pd && (strlen(pc) > (strlen(pd) + 3)memset(myemail, 0, 1024);memcpy(myemail, pc, strlen(pc) - strlen(pd);if(strcmp(NodeCurr->dir,"/&
41、quot;)fprintf(mailfp,"%s",myemail,NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "")?NodeCurr->page:"");elsefprintf(mailfp,"%s",myemail,NodeCurr->host,NodeCurr->dir, strcmp(NodeCurr->page, "")?NodeCurr->page:""
42、;);if(*(pd + 1)pa = pd + 1;else break;else if(*(pb + 1)elsebreak;pa = pb + 1;fclose(mailfp);/*功能:從src 中找出前面的字母、數(shù)字等內(nèi)含,即email 地址中 的前面部分*/void GetBeforePos(char * src, char * d)char * x;if(src - 1)x = src - 1;else *d = 0; return ;while(x)if(*x >= 'a' && *x <= 'z') x-; cont
43、inue;else if(*x >= 'A' && *x <= 'Z') x-; continue;else if(*x >= '0' && *x <= '9') x-; continue;else if(*x = '.' | *x = '-' | *x = '_') x-; continue;else break;x+;if(x) *d = x;else *d = 0;/*功能:從src 中找出后面的字母、數(shù)字等內(nèi)含,即ema
44、il 地址中 的后面部分*/void GetAfterPos(char * src, char * d)char * x;if(src + 1)x = src + 1;else *d = 0; return ;while(x)if(*x >= 'a' && *x <= 'z') x+; continue;else if(*x >= 'A' && *x <= 'Z') x+; continue;else if(*x >= '0' && *x
45、 <= '9') x+; continue;else if(*x = '.' | *x = '-' | *x = '_') x+; continue;else break;if(x) *d = x;else *d = 0;/*功能:從src 中找出前面的字母、數(shù)字等內(nèi)含,即一個(gè)網(wǎng)頁地址中主機(jī)名后面的部分*/void GetAfterPosWithSlash(char * src, char * d)char * x;if(src)x = src;else *d = 0; return ;while(x)if(*x >=
46、 'a' && *x <= 'z') x+; continue;else if(*x >= 'A' && *x <= 'Z') x+; continue;else if(*x >= '0' && *x <= '9') x+; continue;else if(*x = '.' | *x = '-' | *x = '_' | *x = '=') x+; cont
47、inue; else if(*x = ':' | *x = '/' | *x = '?' | *x = '&') x+; continue; else break;if(x) *d = x;else *d = 0;/*功能:為myanchor 分配len 大小的內(nèi)存*/void GetMemory(char * myanchor, int len)if(!(*myanchor)(*myanchor) = (char *)malloc(len + 1);else(*myanchor) = (char *)realloc(vo
48、id *)(*myanchor), len + 1);memset(*myanchor), 0, len + 1);/*功能:從src 中分析出網(wǎng)頁鏈接,并加入到當(dāng)前節(jié)點(diǎn)的子節(jié)點(diǎn)上*/void GetLink(char * src)char * pa, * pb, * pc;char * myanchor = 0;int len = 0;pa = src;do if(pb = strstr(pa, "href='")pc = strchr(pb + 6, ''');len = strlen(pb + 6) - strlen(pc);GetMemory(&myanchor, len);memcpy(myanchor, pb + 6, len);else i
溫馨提示
- 1. 本站所有資源如無特殊說明,都需要本地電腦安裝OFFICE2007和PDF閱讀器。圖紙軟件為CAD,CAXA,PROE,UG,SolidWorks等.壓縮文件請下載最新的WinRAR軟件解壓。
- 2. 本站的文檔不包含任何第三方提供的附件圖紙等,如果需要附件,請聯系上傳者。文件的所有權益歸上傳用戶所有。
- 3. 本站RAR壓縮包中若帶圖紙,網頁內容里面會有圖紙預覽,若沒有圖紙預覽就沒有圖紙。
- 4. 未經權益所有人同意不得將文件中的內容挪作商業或盈利用途。
- 5. 人人文庫網僅提供信息存儲空間,僅對用戶上傳內容的表現方式做保護處理,對用戶上傳分享的文檔內容本身不做任何修改或編輯,并不能對任何下載內容負責。
- 6. 下載文件中如有侵權或不適當內容,請與我們聯系,我們立即糾正。
- 7. 本站不保證下載資源的準確性、安全性和完整性, 同時也不承擔用戶因使用這些下載資源對自己和他人造成任何形式的傷害或損失。
最新文檔
- 汽車抵押銷售代理合同樣本
- 素描景觀考試題及答案
- 倉庫柜子改造方案(3篇)
- 舊屋安全檢測(cè)方案
- 2026版《全品高考》選考復(fù)習(xí)方案生物0401 第11講 細(xì)胞的增殖
- 美容老師培訓(xùn)課件
- 分泌性中耳炎的護(hù)理
- 2026版《全品高考》選考復(fù)習(xí)方案物理02 單元過關(guān)卷(二) 含答案
- 棗莊高考試題及答案
- 醫(yī)學(xué)影像考試題及答案
- 20120309-奇瑞KD索賠培訓(xùn)材料(new)
- 社區(qū)獲得性肺炎ppt
- 直流屏檢修作業(yè)指導(dǎo)書(word文檔)
- YY/T 1293.2-2022接觸性創(chuàng)面敷料第2部分:聚氨酯泡沫敷料
- GB/T 19404-2003微波鐵氧體器件主要性能測(cè)量方法
- GB/T 18418-2017家用衛(wèi)生殺蟲用品電熱蚊香液
- GB/T 17456.2-2010球墨鑄鐵管外表面鋅涂層第2部分:帶終飾層的富鋅涂料涂層
- 政府用地項(xiàng)目用地報(bào)批流程
- 高校畢業(yè)生學(xué)籍檔案管理課件
- 老年人的生理變化特點(diǎn)課件
- 徐健順吟誦文集(.12.16)
評(píng)論
0/150
提交評(píng)論