Skip to content

Commit

Permalink
Ver 3
Browse files Browse the repository at this point in the history
Ver 3
  • Loading branch information
subodhjena committed Dec 30, 2013
1 parent 07c0da3 commit a7a571f
Show file tree
Hide file tree
Showing 31 changed files with 128 additions and 33 deletions.
2 changes: 1 addition & 1 deletion App.config
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<abot>
<crawlBehavior
maxConcurrentThreads="20"
maxPagesToCrawl="100000000"
maxPagesToCrawl="10000000"
maxPagesToCrawlPerDomain="0"
maxPageSizeInBytes="0"
userAgentString="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v@ABOTASSEMBLYVERSION@ http://code.google.com/p/abot)"
Expand Down
19 changes: 10 additions & 9 deletions CrawlerDB.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,25 @@ namespace FileCrawler
{
public class CrawlerDB
{
public const string DB_CONN_STRING = "data source=SUBODH;initial catalog=DownloadCrawler;Integrated security=true";
public const string DB_CONN_STRING = "data source=SUBODH;initial catalog=FileCrawler;Integrated security=true";

public void SaveFileURLToDB(String hostName, string fileType, string fileUrl)
public void SaveFileURLToDB(String tableName, String hostName, string fileType, string fileDescription,string fileUrl)
{
using (SqlConnection con = new SqlConnection(DB_CONN_STRING))
{
con.Open();
try
{
using (SqlCommand command = new SqlCommand("INSERT INTO [DownloadCrawler].[dbo].[CrawledFiles] VALUES(@host, @file, @url, @credt,@crename,@upddt,@updnm)", con))
using (SqlCommand command = new SqlCommand("INSERT INTO [FileCrawler].[dbo].[" + tableName + "] VALUES(@hostName,@fileType,@fileDescription,@url,@createdDt,@createdByName,@updatedDt,@updatedByName)", con))
{
command.Parameters.Add(new SqlParameter("host", hostName));
command.Parameters.Add(new SqlParameter("file", fileType));
command.Parameters.Add(new SqlParameter("hostName", hostName));
command.Parameters.Add(new SqlParameter("fileType", fileType));
command.Parameters.Add(new SqlParameter("fileDescription", fileDescription));
command.Parameters.Add(new SqlParameter("url", fileUrl));
command.Parameters.Add(new SqlParameter("credt", DateTime.Now.ToString()));
command.Parameters.Add(new SqlParameter("crename", "Subodh"));
command.Parameters.Add(new SqlParameter("upddt", DBNull.Value));
command.Parameters.Add(new SqlParameter("updnm", DBNull.Value));
command.Parameters.Add(new SqlParameter("createdDt", DateTime.Now.ToString()));
command.Parameters.Add(new SqlParameter("createdByName", "Subodh"));
command.Parameters.Add(new SqlParameter("updatedDt", DBNull.Value));
command.Parameters.Add(new SqlParameter("updatedByName", DBNull.Value));
command.ExecuteNonQuery();
}
}
Expand Down
26 changes: 26 additions & 0 deletions FileCrawler.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,39 @@
</ItemGroup>
<ItemGroup>
<Compile Include="CrawlerDB.cs" />
<Compile Include="FileTypes.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config">
<SubType>Designer</SubType>
</None>
<None Include="FileTypes\3DImageFiles.csv" />
<None Include="FileTypes\AudioFiles.csv" />
<None Include="FileTypes\BackupFiles.csv" />
<None Include="FileTypes\CADFiles.csv" />
<None Include="FileTypes\CompressedFiles.csv" />
<None Include="FileTypes\DataFiles.csv" />
<None Include="FileTypes\DatabaseFiles.csv" />
<None Include="FileTypes\DeveloperFiles.csv" />
<None Include="FileTypes\DiskImageFiles.csv" />
<None Include="FileTypes\EncodedFiles.csv" />
<None Include="FileTypes\ExecutableFiles.csv" />
<None Include="FileTypes\FontFiles.csv" />
<None Include="FileTypes\GameFiles.csv" />
<None Include="FileTypes\GISFiles.csv" />
<None Include="FileTypes\MiscFiles.csv" />
<None Include="FileTypes\PageLayoutFiles.csv" />
<None Include="FileTypes\PluginFiles.csv" />
<None Include="FileTypes\RasterImageFiles.csv" />
<None Include="FileTypes\SettingsFiles.csv" />
<None Include="FileTypes\SpreadsheetFiles.csv" />
<None Include="FileTypes\SystemFiles.csv" />
<None Include="FileTypes\TextFiles.csv" />
<None Include="FileTypes\VectorImageFiles.csv" />
<None Include="FileTypes\VideoFiles.csv" />
<None Include="FileTypes\WebFiles.csv" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include=".NETFramework,Version=v4.5">
Expand Down
Binary file added FileCrawler.sql
Binary file not shown.
42 changes: 42 additions & 0 deletions FileTypes.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace FileCrawler
{
/// <summary>
/// Loads file-type filter entries from a directory of text/CSV files.
/// Each file contributes one entry per non-blank line, and the file's own
/// name (without extension) is appended to every entry as a trailing field.
/// </summary>
class FileTypes
{
    /// <summary>
    /// Reads every file found directly inside directory <paramref name="p"/> and
    /// returns the combined list of entries.
    /// </summary>
    /// <param name="p">Path of the directory that holds the file-type definition files.</param>
    /// <returns>
    /// A new list for each call. Every element is the original line, followed by a
    /// comma, followed by the source file's name without its extension
    /// (e.g. line ".mp3,MP3 Audio" in "AudioFiles.csv" yields ".mp3,MP3 Audio,AudioFiles").
    /// </returns>
    /// <remarks>
    /// No exceptions are caught here: failures from <see cref="Directory.GetFiles(string)"/>
    /// or from opening/reading a file propagate to the caller.
    /// The order of entries follows the order in which the file system enumerates files.
    /// </remarks>
    public List<String> GetFileTypesToDownlaod(String p)
    {
        // A fresh list per call: reusing one instance-level list made a second
        // call return every entry twice.
        List<String> entries = new List<String>();

        foreach (String filePath in Directory.GetFiles(p))
        {
            AppendEntries(filePath, entries);
        }

        return entries;
    }

    /// <summary>
    /// Appends one entry per non-blank line of <paramref name="filePath"/> to
    /// <paramref name="entries"/>, tagging each with the file's base name.
    /// </summary>
    private static void AppendEntries(String filePath, List<String> entries)
    {
        // Loop-invariant, so computed once. GetFileNameWithoutExtension also copes
        // with names that have no extension or one that is not three characters,
        // where the former Substring(0, Length - 4) threw or trimmed the wrong part.
        String sourceName = Path.GetFileNameWithoutExtension(filePath);

        // 'using' guarantees the reader is closed even if ReadLine throws.
        using (StreamReader reader = new StreamReader(filePath))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Blank lines carry no file-type data and would otherwise yield a
                // malformed entry with too few comma-separated fields.
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                entries.Add(line + "," + sourceName);
            }
        }
    }
}
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
72 changes: 49 additions & 23 deletions Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,22 @@ namespace FileCrawler
{
class Program
{
static String webURL = @"http://www.songspk.name/";
static String filterTxt = ".zip";
#region "Fields and Object Declaration"

static String webURL = @"http://ebooks.allfree-stuff.com/";
static String fileTypePath = @"C:\Users\Subodhlc\Documents\Visual Studio 2012\Projects\FileCrawler\FileCrawler\FileTypes";
static CrawlerDB crawalerDatabase = new CrawlerDB();

static FileTypes fileTyp = new FileTypes();
static List<String> filters;

#endregion


static void Main(string[] args)
{
//Will Get the FileTypes to Download
filters = fileTyp.GetFileTypesToDownlaod(fileTypePath);

//Will use app.config for configuration
PoliteWebCrawler crawler = new PoliteWebCrawler();

Expand Down Expand Up @@ -48,7 +58,7 @@ static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedA
SaveURLFail(crawledPage.Uri.AbsoluteUri.ToString());
else
SaveURLSuccess(crawledPage.Uri.AbsoluteUri.ToString());

if (string.IsNullOrEmpty(crawledPage.RawContent))
SaveURLNoContent(crawledPage.Uri.AbsoluteUri.ToString());
}
Expand All @@ -69,42 +79,58 @@ static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e
private static void SaveURLSuccess(string p)
{
Console.WriteLine("Shubh :Crawl of page succeeded {0}", p);
if (p.Contains(filterTxt) && Uri.IsWellFormedUriString(p, UriKind.RelativeOrAbsolute))
{
crawalerDatabase.SaveFileURLToDB(webURL, filterTxt, p);
}
WriteToDB(p);
}
private static void SaveURLFail(string p)
{
Console.WriteLine("Shubh :Crawl of page Failed {0}", p);
if (p.Contains(filterTxt) && Uri.IsWellFormedUriString(p, UriKind.RelativeOrAbsolute))
{
crawalerDatabase.SaveFileURLToDB(webURL, filterTxt, p);
}
WriteToDB(p);
}
private static void SavePageLinksCrawlDisallowed(string p)
{
Console.WriteLine("Shubh :Page Links Craw lDisallowed {0}", p);
if (p.Contains(filterTxt) && Uri.IsWellFormedUriString(p, UriKind.RelativeOrAbsolute))
{
crawalerDatabase.SaveFileURLToDB(webURL, filterTxt, p);
}
WriteToDB(p);
}
private static void SaveURLNoContent(string p)
{
Console.WriteLine("Shubh :Crawl of page had no content {0}", p);
if (p.Contains(filterTxt) && Uri.IsWellFormedUriString(p, UriKind.RelativeOrAbsolute))
{
crawalerDatabase.SaveFileURLToDB(webURL, filterTxt, p);
}
WriteToDB(p);
}
private static void SavePageCrawlDisallowed(string p)
{
Console.WriteLine("Shubh :Crawl of page Not allowed {0}", p);
if (p.Contains(filterTxt) && Uri.IsWellFormedUriString(p, UriKind.RelativeOrAbsolute))
WriteToDB(p);
}

//DB Writes
private static void WriteToDB(string p)
{
try
{
crawalerDatabase.SaveFileURLToDB(webURL, filterTxt, p);

foreach (String filter in filters)
{
String[] splitFilters = filter.Split(',');
String filterFileType = splitFilters[0];
String filterFileDescription = splitFilters[1];
String filterFileTable = splitFilters[2];

if (p.Trim().EndsWith(filterFileType) && Uri.IsWellFormedUriString(p, UriKind.RelativeOrAbsolute))
{
crawalerDatabase.SaveFileURLToDB(filterFileTable, webURL, filterFileType, filterFileDescription, p);
Console.WriteLine("Wrote :" + p + " to :" + filterFileTable);
}
}
}
catch (Exception ex)
{
System.Console.WriteLine("**************************************");
System.Console.WriteLine("WRITING: ERROR DESCRIPTION");
System.Console.WriteLine(" Error Message:" + ex.Message);
System.Console.WriteLine(" Stack Trace :" + ex.StackTrace);
System.Console.WriteLine("**************************************");
}
}

}
}
}

0 comments on commit a7a571f

Please sign in to comment.