Skip to content

Commit 53ee492

Browse files
authored
Add line numbers support (#10)
* feat: add line numbers support
* build: add cstdint header
* refactor: remove variable underscore
* refactor: review
1 parent c103b51 commit 53ee492

File tree

4 files changed

+169
-34
lines changed

4 files changed

+169
-34
lines changed

README.md

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ A fast C++ lexer for extracting named exports from CommonJS modules. This librar
66

77
- **Fast**: Zero-copy parsing for most exports using `std::string_view`
88
- **Accurate**: Handles complex CommonJS patterns including re-exports, Object.defineProperty, and transpiler output
9+
- **Source Locations**: Each export includes a 1-based line number for tooling integration
910
- **Unicode Support**: Properly unescapes JavaScript string literals including `\u{XXXX}` and surrogate pairs
1011
- **Optional SIMD Acceleration**: Can use [simdutf](https://github.com/simdutf/simdutf) for faster string operations
1112
- **No Dependencies**: Single-header distribution available (simdutf is optional)
@@ -49,20 +50,21 @@ int main() {
4950
if (result) {
5051
std::cout << "Exports found:" << std::endl;
5152
for (const auto& exp : result->exports) {
52-
std::cout << " - " << lexer::get_string_view(exp) << std::endl;
53+
std::cout << " - " << lexer::get_string_view(exp)
54+
<< " (line " << exp.line << ")" << std::endl;
5355
}
5456
}
55-
57+
5658
return 0;
5759
}
5860
```
5961

6062
Output:
6163
```
6264
Exports found:
63-
- foo
64-
- bar
65-
- baz
65+
- foo (line 2)
66+
- bar (line 3)
67+
- baz (line 4)
6668
```
6769

6870
## API Reference
@@ -85,11 +87,22 @@ Parses CommonJS source code and extracts export information.
8587
8688
```cpp
8789
struct lexer_analysis {
88-
std::vector<export_string> exports; // Named exports
89-
std::vector<export_string> re_exports; // Re-exported module specifiers
90+
std::vector<export_entry> exports; // Named exports
91+
std::vector<export_entry> re_exports; // Re-exported module specifiers
92+
};
93+
```
94+
95+
### `lexer::export_entry`
96+
97+
```cpp
98+
struct export_entry {
99+
export_string name;
100+
uint32_t line; // 1-based line number
90101
};
91102
```
92103
104+
Each export/re-export entry includes the name and the 1-based line number where it was found in the source.
105+
93106
### `lexer::export_string`
94107
95108
```cpp
@@ -104,9 +117,10 @@ Export names are stored as a variant to avoid unnecessary copies:
104117

105118
```cpp
106119
inline std::string_view get_string_view(const export_string& s);
120+
inline std::string_view get_string_view(const export_entry& e);
107121
```
108122
109-
Helper function to get a `string_view` from either variant type.
123+
Helper function to get a `string_view` from an `export_string` or `export_entry`.
110124
111125
### `lexer::get_last_error`
112126

include/merve/parser.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "merve/version.h"
55

6+
#include <cstdint>
67
#include <optional>
78
#include <string>
89
#include <string_view>
@@ -47,6 +48,14 @@ enum lexer_error {
4748
*/
4849
using export_string = std::variant<std::string, std::string_view>;
4950

51+
/**
52+
* @brief An export name together with its 1-based source line number.
53+
*/
54+
struct export_entry {
55+
export_string name;
56+
uint32_t line; // 1-based line number
57+
};
58+
5059
/**
5160
* @brief Result of parsing a CommonJS module.
5261
*/
@@ -61,7 +70,7 @@ struct lexer_analysis {
6170
* - module.exports = { a, b, c }
6271
* - Object.defineProperty(exports, 'name', {...})
6372
*/
64-
std::vector<export_string> exports{};
73+
std::vector<export_entry> exports{};
6574

6675
/**
6776
* @brief Module specifiers from re-export patterns.
@@ -72,7 +81,7 @@ struct lexer_analysis {
7281
* - __export(require('other'))
7382
* - Object.keys(require('other')).forEach(...)
7483
*/
75-
std::vector<export_string> re_exports{};
84+
std::vector<export_entry> re_exports{};
7685
};
7786

7887
/**
@@ -89,6 +98,13 @@ inline std::string_view get_string_view(const export_string& s) {
8998
return std::visit([](const auto& v) -> std::string_view { return v; }, s);
9099
}
91100

101+
/**
102+
* @brief Get a string_view from an export_entry (delegates to the name field).
103+
*/
104+
inline std::string_view get_string_view(const export_entry& e) {
105+
return get_string_view(e.name);
106+
}
107+
92108
/**
93109
* @brief Parse CommonJS source code and extract export information.
94110
*

src/parser.cpp

Lines changed: 52 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,8 @@ class CJSLexer {
325325
uint16_t openTokenDepth;
326326
uint16_t templateDepth;
327327

328+
uint32_t line;
329+
328330
bool lastSlashWasDivision;
329331
bool nextBraceIsClass;
330332

@@ -335,8 +337,16 @@ class CJSLexer {
335337
StarExportBinding* starExportStack;
336338
const StarExportBinding* STAR_EXPORT_STACK_END;
337339

338-
std::vector<export_string>& exports;
339-
std::vector<export_string>& re_exports;
340+
std::vector<export_entry>& exports;
341+
std::vector<export_entry>& re_exports;
342+
343+
// Increments `line` when consuming a line terminator.
344+
// - Counts '\n' as a newline.
345+
// - Counts '\r' as a newline only when it is not part of a CRLF sequence.
346+
// (i.e., the next character is not '\n' or we're at end-of-input.)
347+
void countNewline(char ch) {
348+
line += (ch == '\n') || (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n'));
349+
}
340350

341351
// Character classification helpers using lookup tables
342352
static bool isBr(char c) {
@@ -495,6 +505,8 @@ class CJSLexer {
495505
return ch;
496506
} else if (!isBrOrWs(ch)) {
497507
return ch;
508+
} else {
509+
countNewline(ch);
498510
}
499511
} while (pos++ < end);
500512
return ch;
@@ -503,8 +515,10 @@ class CJSLexer {
503515
void lineComment() {
504516
while (pos++ < end) {
505517
char ch = *pos;
506-
if (ch == '\n' || ch == '\r')
518+
if (ch == '\n' || ch == '\r') {
519+
countNewline(ch);
507520
return;
521+
}
508522
}
509523
}
510524

@@ -516,6 +530,7 @@ class CJSLexer {
516530
pos++;
517531
return;
518532
}
533+
countNewline(ch);
519534
}
520535
}
521536

@@ -527,8 +542,13 @@ class CJSLexer {
527542
if (ch == '\\') {
528543
if (pos + 1 >= end) break;
529544
ch = *++pos;
530-
if (ch == '\r' && *(pos + 1) == '\n')
531-
pos++;
545+
if (ch == '\r') {
546+
++line;
547+
if (*(pos + 1) == '\n')
548+
pos++;
549+
} else if (ch == '\n') {
550+
++line;
551+
}
532552
} else if (isBr(ch))
533553
break;
534554
}
@@ -580,8 +600,12 @@ class CJSLexer {
580600
}
581601
if (ch == '`')
582602
return;
583-
if (ch == '\\' && pos + 1 < end)
603+
if (ch == '\\' && pos + 1 < end) {
584604
pos++;
605+
countNewline(*pos);
606+
} else {
607+
countNewline(ch);
608+
}
585609
}
586610
syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING);
587611
}
@@ -614,7 +638,7 @@ class CJSLexer {
614638
#endif
615639
}
616640

617-
void addExport(std::string_view export_name) {
641+
void addExport(std::string_view export_name, uint32_t at_line) {
618642
// Skip surrounding quotes if present
619643
if (!export_name.empty() && (export_name.front() == '\'' || export_name.front() == '"')) {
620644
export_name.remove_prefix(1);
@@ -625,11 +649,11 @@ class CJSLexer {
625649
if (!needsUnescaping(export_name)) {
626650
// Check if this export already exists (avoid duplicates)
627651
for (const auto& existing : exports) {
628-
if (get_string_view(existing) == export_name) {
652+
if (get_string_view(existing.name) == export_name) {
629653
return; // Already exists, skip
630654
}
631655
}
632-
exports.push_back(export_name);
656+
exports.push_back(export_entry{export_name, at_line});
633657
return;
634658
}
635659

@@ -644,14 +668,14 @@ class CJSLexer {
644668

645669
// Check if this export already exists (avoid duplicates)
646670
for (const auto& existing : exports) {
647-
if (get_string_view(existing) == name) {
671+
if (get_string_view(existing.name) == name) {
648672
return; // Already exists, skip
649673
}
650674
}
651-
exports.push_back(std::move(unescaped.value()));
675+
exports.push_back(export_entry{std::move(unescaped.value()), at_line});
652676
}
653677

654-
void addReexport(std::string_view reexport_name) {
678+
void addReexport(std::string_view reexport_name, uint32_t at_line) {
655679
// Skip surrounding quotes if present
656680
if (!reexport_name.empty() && (reexport_name.front() == '\'' || reexport_name.front() == '"')) {
657681
reexport_name.remove_prefix(1);
@@ -660,7 +684,7 @@ class CJSLexer {
660684

661685
// Fast path: no escaping needed, use string_view directly
662686
if (!needsUnescaping(reexport_name)) {
663-
re_exports.push_back(reexport_name);
687+
re_exports.push_back(export_entry{reexport_name, at_line});
664688
return;
665689
}
666690

@@ -670,7 +694,7 @@ class CJSLexer {
670694
return; // Skip invalid escape sequences
671695
}
672696

673-
re_exports.push_back(std::move(unescaped.value()));
697+
re_exports.push_back(export_entry{std::move(unescaped.value()), at_line});
674698
}
675699

676700
bool readExportsOrModuleDotExports(char ch) {
@@ -712,7 +736,7 @@ class CJSLexer {
712736
switch (requireType) {
713737
case RequireType::ExportStar:
714738
case RequireType::ExportAssign:
715-
addReexport(std::string_view(reexportStart, reexportEnd - reexportStart));
739+
addReexport(std::string_view(reexportStart, reexportEnd - reexportStart), line);
716740
return true;
717741
default:
718742
if (starExportStack < STAR_EXPORT_STACK_END) {
@@ -773,7 +797,7 @@ class CJSLexer {
773797
return;
774798
}
775799
}
776-
addExport(std::string_view(startPos, endPos - startPos));
800+
addExport(std::string_view(startPos, endPos - startPos), line);
777801
} else if (ch == '\'' || ch == '"') {
778802
const char* start = pos;
779803
stringLiteral(ch);
@@ -786,7 +810,7 @@ class CJSLexer {
786810
pos = revertPos;
787811
return;
788812
}
789-
addExport(std::string_view(start, end_pos - start));
813+
addExport(std::string_view(start, end_pos - start), line);
790814
}
791815
} else if (ch == '.' && matchesAt(pos + 1, end, "..")) {
792816
pos += 3;
@@ -825,7 +849,7 @@ class CJSLexer {
825849
const char* endPos = pos;
826850
ch = commentWhitespace();
827851
if (ch == '=') {
828-
addExport(std::string_view(startPos, endPos - startPos));
852+
addExport(std::string_view(startPos, endPos - startPos), line);
829853
return;
830854
}
831855
}
@@ -843,7 +867,7 @@ class CJSLexer {
843867
pos++;
844868
ch = commentWhitespace();
845869
if (ch != '=') break;
846-
addExport(std::string_view(startPos, endPos - startPos));
870+
addExport(std::string_view(startPos, endPos - startPos), line);
847871
}
848872
break;
849873
}
@@ -974,7 +998,7 @@ class CJSLexer {
974998
ch = commentWhitespace();
975999
if (ch != ':') break;
9761000
if (exportStart && exportEnd)
977-
addExport(std::string_view(exportStart, exportEnd - exportStart));
1001+
addExport(std::string_view(exportStart, exportEnd - exportStart), line);
9781002
pos = revertPos;
9791003
return;
9801004
} else if (ch == 'g') {
@@ -1042,7 +1066,7 @@ class CJSLexer {
10421066
ch = commentWhitespace();
10431067
if (ch != ')') break;
10441068
if (exportStart && exportEnd)
1045-
addExport(std::string_view(exportStart, exportEnd - exportStart));
1069+
addExport(std::string_view(exportStart, exportEnd - exportStart), line);
10461070
return;
10471071
}
10481072
break;
@@ -1406,7 +1430,7 @@ class CJSLexer {
14061430
StarExportBinding* curCheckBinding = &starExportStack_[0];
14071431
while (curCheckBinding != starExportStack) {
14081432
if (curCheckBinding->id == id) {
1409-
addReexport(curCheckBinding->specifier);
1433+
addReexport(curCheckBinding->specifier, line);
14101434
pos = revertPos;
14111435
return;
14121436
}
@@ -1506,9 +1530,10 @@ class CJSLexer {
15061530
}
15071531

15081532
public:
1509-
CJSLexer(std::vector<export_string>& out_exports, std::vector<export_string>& out_re_exports)
1533+
CJSLexer(std::vector<export_entry>& out_exports, std::vector<export_entry>& out_re_exports)
15101534
: source(nullptr), pos(nullptr), end(nullptr), lastTokenPos(nullptr),
15111535
templateStackDepth(0), openTokenDepth(0), templateDepth(0),
1536+
line(1),
15121537
lastSlashWasDivision(false), nextBraceIsClass(false),
15131538
templateStack_{}, openTokenPosStack_{}, openClassPosStack{},
15141539
starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr),
@@ -1525,6 +1550,7 @@ class CJSLexer {
15251550
templateStackDepth = 0;
15261551
openTokenDepth = 0;
15271552
templateDepth = std::numeric_limits<uint16_t>::max();
1553+
line = 1;
15281554
lastSlashWasDivision = false;
15291555
starExportStack = &starExportStack_[0];
15301556
STAR_EXPORT_STACK_END = &starExportStack_[MAX_STAR_EXPORTS - 1];
@@ -1549,8 +1575,10 @@ class CJSLexer {
15491575
while (pos++ < end) {
15501576
ch = *pos;
15511577

1552-
if (ch == ' ' || (ch < 14 && ch > 8))
1578+
if (ch == ' ' || (ch < 14 && ch > 8)) {
1579+
countNewline(ch);
15531580
continue;
1581+
}
15541582

15551583
if (openTokenDepth == 0) {
15561584
switch (ch) {

0 commit comments

Comments
 (0)