diff --git a/.gitignore b/.gitignore index b375c13..336fd7f 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,8 @@ lib/ .eslintcache # Misc -.DS_Store \ No newline at end of file +.DS_Store + +# Local datasets +csvs +old_csvs \ No newline at end of file diff --git a/README.md b/README.md index cb49995..5311f3b 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ If you are on a GHEC instance (including EMU), please head to `https://github.co DIRECTORY_OF_CSV_CONTENT=XXX ``` -3. Run the script +3. Run the script to deduplicate by username ```bash npm run start @@ -75,3 +75,15 @@ The above script should output something like: ``` You have a total of XX unique developers across your GitHub instances. ``` + +If you want to deduplicate by email (to catch users with multiple usernames), run: + +```bash +npm run email +``` + +The email deduplication script should output something like: + +``` +You have a total of XX unique developers (by email) across your GitHub instances. +``` diff --git a/package.json b/package.json index 3606f03..0193a97 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ }, "scripts": { "start": "npm run build && node lib/src/main.js", + "email": "npm run build && node lib/src/main-email-dedup.js", "clean": "rimraf coverage lib tmp", "prebuild": "npm run lint", "build": "tsc -p tsconfig.json", diff --git a/src/main-email-dedup.ts b/src/main-email-dedup.ts new file mode 100644 index 0000000..4e12dcf --- /dev/null +++ b/src/main-email-dedup.ts @@ -0,0 +1,62 @@ +import { readdirSync, readFileSync } from 'fs'; +import { extname } from 'path'; +import { parse, ParseResult } from 'papaparse'; +import * as dotenv from 'dotenv'; +dotenv.config({ path: '.env' }); + +const GLOBAL_DIRECTORY = process.env.DIRECTORY_OF_CSV_CONTENT || ''; + +// Extended type to include email field (email might be optional) +type CSVDataWithEmail = { + 'User login': string; + 'Organization / repository': string; + 'Last pushed date': string; + 'Last pushed 
email'?: string;
+};
+
+/**
+ * Builds the path of a CSV file inside GLOBAL_DIRECTORY.
+ *
+ * A separator is inserted only when the configured directory is non-empty
+ * and does not already end in `/` or `\`, so both `csvs` and `csvs/` work.
+ */
+const csvPath = (file: string) =>
+  GLOBAL_DIRECTORY === '' || /[\\/]$/.test(GLOBAL_DIRECTORY)
+    ? `${GLOBAL_DIRECTORY}${file}`
+    : `${GLOBAL_DIRECTORY}/${file}`;
+
+/**
+ * Lists the entries of a directory.
+ *
+ * @param folder Directory to read.
+ * @returns The entry names reported by `readdirSync`.
+ */
+export const getFilesInDirectory = async (folder: string) =>
+  readdirSync(folder);
+
+/**
+ * Keeps only the file names whose extension matches `format`.
+ *
+ * @param files File names to inspect.
+ * @param format Extension including the leading dot, e.g. `.csv`.
+ *   Compared case-insensitively.
+ * @returns The matching file names, in their original order.
+ */
+export const filerByFileExtention = async (files: string[], format: string) => {
+  const wanted = format.toLowerCase();
+  return files.filter((file) => extname(file).toLowerCase() === wanted);
+};
+
+/**
+ * Reads every given file from GLOBAL_DIRECTORY as UTF-8 text.
+ *
+ * @param files File names relative to GLOBAL_DIRECTORY.
+ * @returns The file contents, in the same order as `files`.
+ */
+export const readMultipleFiles = async (files: string[]) =>
+  files.map((file) => readFileSync(csvPath(file), { encoding: 'utf8' }));
+
+/**
+ * Parses each CSV text with Papa Parse using `header: true`.
+ *
+ * @param files Raw CSV contents (not file names).
+ * @returns One parse result per input, in the same order.
+ */
+export const convertContentToJSON = async (files: string[]) =>
+  files.map((file) => parse(file, { header: true }));
+
+/**
+ * Concatenates the `data` rows of all parse results into a single array.
+ *
+ * @param data Parse results to merge.
+ * @returns Every row of every result, in input order.
+ */
+export const mergeFileContent = async (data: ParseResult[]) => {
+  const mergedData: CSVDataWithEmail[] = [];
+  for (const file of data) {
+    // Append row by row: `push(...file.data)` passes every row as a call
+    // argument and can throw a RangeError on very large CSV files.
+    for (const row of file.data) {
+      mergedData.push(row);
+    }
+  }
+  return mergedData;
+};
+
+/**
+ * Collects the distinct values of the `Last pushed email` column.
+ *
+ * Emails are trimmed and lower-cased before comparison, so values that
+ * differ only by case or surrounding whitespace count once. Rows whose
+ * email is missing, not a string, or blank are ignored.
+ *
+ * @param data Rows to scan.
+ * @returns The unique normalised emails, in first-seen order.
+ */
+export const uniqueUsersByEmail = async (data: CSVDataWithEmail[]) => {
+  const seen = new Set<string>();
+  for (const user of data) {
+    const email: unknown = user['Last pushed email'];
+    if (typeof email !== 'string') {
+      continue;
+    }
+    const normalised = email.trim().toLowerCase();
+    if (normalised !== '') {
+      seen.add(normalised);
+    }
+  }
+  return Array.from(seen);
+};
+
+/**
+ * Reads every `.csv` file in GLOBAL_DIRECTORY, merges the rows and prints
+ * the number of unique developers, deduplicated by email.
+ */
+const run = async () => {
+  const files = await getFilesInDirectory(GLOBAL_DIRECTORY);
+  const csvFilesFound = await filerByFileExtention(files, '.csv');
+  const csvFiles = await readMultipleFiles(csvFilesFound);
+  const jsonFiles = await convertContentToJSON(csvFiles);
+  const content = await mergeFileContent(jsonFiles);
+  const unique = await uniqueUsersByEmail(content);
+  console.log(
+    `You have a total of ${unique.length} unique developers (by email) across your GitHub instances.`,
+  );
+};
+
+// Handle the rejection explicitly instead of leaving a floating promise.
+run().catch((error: unknown) => {
+  console.error(error);
+  process.exitCode = 1;
+});