Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ The lexer outlined in this document is a simple lexer that is capable of tokeniz
- Comments: `//`

---
## Paser *so far*
## Parser *so far*
The parser is the second step in the compilation process and it is the second part of the A_Language compiler to be implemented. To this point the parser is capable of parsing the following:
- Function declarations *no parameters so far*
- Type declarations: `void`, custom types
Expand All @@ -40,7 +40,6 @@ The grammer rules for the A_Language are written in BNF form. The grammar rules
- The lexer assumes that the input file is an ascii file.
- The lexer assumes that the input file is a valid A_Language program. It does not check for syntax errors yet.
- The lexer does not recognize the following tokens:
- `+=`, `-=`, `*=`, `/=`, `++`, `--`
- `+=`, `-=`, `*=`, `/=`, `++`, `--`
- `<<`, `>>`, `^`, `~`, `->`, `::`
- `<<=`, `>>=`, `&=`, `|=`, `^=`
Expand Down
72 changes: 64 additions & 8 deletions inc/lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ struct SourceFile {
std::string buffer;

void fill_the_buffer(){
std::filesystem::path _path {path};
if (!std::filesystem::exists(path)) {
std::filesystem::path fs_path {path}; // Changed variable name for clarity
if (!std::filesystem::exists(fs_path)) { // Use fs_path
throw std::runtime_error(std::string(path) + std::string(" file not existing.\n"));
}
std::ifstream source_code{_path};
std::ifstream source_code{fs_path}; // Use fs_path
if (!source_code) {
throw std::runtime_error(std::string(path) + std::string(" failed to open.\n"));
}
Expand Down Expand Up @@ -155,16 +155,72 @@ class Lexer {
Lexer(const SourceFile &source)
: source_file(std::make_unique<SourceFile>(source))
{
line_start_idx.push_back(0);
for(int i =0; i < source_file->buffer.size(); i++) {
if (source_file->buffer[i] == '\n') {
line_start_idx.push_back(i);
line_start_idx.push_back(0); // Always start with 0 for the first line.
if (!source_file->buffer.empty()) {
for(size_t i = 0; i < source_file->buffer.size(); ++i) { // Change loop variable type to size_t
if (source_file->buffer[i] == '\n') {
line_start_idx.push_back(i + 1); // Next line starts after the newline
}
}
// Add the end of the buffer as the end of the last line,
// but only if the buffer is not empty and the last character wasn't a newline (already handled).
if (line_start_idx.back() != source_file->buffer.size() && source_file->buffer.back() != '\n') {
line_start_idx.push_back(source_file->buffer.size());
}
// If the file ends with a newline, the last push_back(i+1) would already be buffer.size()
// if the buffer is not empty and the last char is \n, then line_start_idx.back() should be buffer.size().
// if line_start_idx.back() is i+1, and i is buffer.size() -1, then line_start_idx.back() is buffer.size().
}
line_start_idx.push_back(source_file->buffer.size() - 1);
}
Token GetNextToken();
std::string_view GetLine(size_t line_number) const;
};

/// Maps a TokenType enumerator to its human-readable name.
/// Intended for debugging/diagnostic output (e.g. token dumps).
/// Any value outside the known set yields "InvalidTokenType".
inline std::string_view to_string(TokenType token_type) {
switch (token_type) {
// --- Sentinels -----------------------------------------------------------
case TokenType::Unknown: return "Unknown";
case TokenType::_EOF: return "_EOF";
case TokenType::EOL: return "EOL";
// --- Literals and identifiers -------------------------------------------
case TokenType::Identifier: return "Identifier";
case TokenType::Int_Number: return "Int_Number";
case TokenType::Float_Number: return "Float_Number";
case TokenType::String: return "String";
// --- Keywords ------------------------------------------------------------
case TokenType::Keyword_Function: return "Keyword_Function";
case TokenType::Keyword_Number: return "Keyword_Number";
case TokenType::Keyword_Void: return "Keyword_Void";
case TokenType::Keyword_If: return "Keyword_If";
case TokenType::Keyword_Else: return "Keyword_Else";
case TokenType::Keyword_While: return "Keyword_While";
case TokenType::Keyword_Return: return "Keyword_Return";
case TokenType::Keyword_const: return "Keyword_const";
case TokenType::Keyword_String: return "Keyword_String";
case TokenType::Keyword_Var: return "Keyword_Var";
case TokenType::Keyword_Int: return "Keyword_Int";
case TokenType::Keyword_Float: return "Keyword_Float";
// --- Punctuation ---------------------------------------------------------
case TokenType::LeftParenthesis: return "LeftParenthesis";
case TokenType::RightParenthesis: return "RightParenthesis";
case TokenType::LeftBrace: return "LeftBrace";
case TokenType::RightBrace: return "RightBrace";
case TokenType::Colon: return "Colon";
case TokenType::Semicolon: return "Semicolon";
case TokenType::Comma: return "Comma";
// --- Operators -----------------------------------------------------------
case TokenType::Slash: return "Slash";
case TokenType::SlashSlash: return "SlashSlash";
case TokenType::Equal: return "Equal";
case TokenType::EqualEqual: return "EqualEqual";
case TokenType::Ampersand: return "Ampersand";
case TokenType::AmpersandAmpersand: return "AmpersandAmpersand";
case TokenType::Pipe: return "Pipe";
case TokenType::PipePipe: return "PipePipe";
case TokenType::Plus: return "Plus";
case TokenType::Minus: return "Minus";
case TokenType::Asterisk: return "Asterisk";
case TokenType::LowerThan: return "LowerThan";
case TokenType::GreaterThan: return "GreaterThan";
case TokenType::Exclamation: return "Exclamation";
// Defensive fallback for out-of-range values cast into the enum.
default: return "InvalidTokenType";
}
}

} // namespace AL
4 changes: 3 additions & 1 deletion inc/parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ class Parser{

void PopNextToken() { next_token = lexer->GetNextToken(); }

// Error reporting and recovery
void ReportError(const Token& error_token, const std::string& expectation_message);
void SyncOn(TokenType type) {
incomplete_AST = true;
// incomplete_AST is set by ReportError, which should be called before SyncOn
while(next_token.type != type && next_token.type != TokenType::_EOF)
PopNextToken();
}
Expand Down
57 changes: 48 additions & 9 deletions src/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,17 +167,56 @@ Token Lexer::GetNextToken() {
}

std::string_view Lexer::GetLine(size_t line_number) const {
assert(line_number > 0);
assert(line_number > 0); // Line numbers are 1-based

int start = line_start_idx[line_number - 1];
int end = 0;
if (line_number >= line_start_idx.size()) end = *(line_start_idx.end() - 1);
else end = line_start_idx[line_number];
// line_start_idx contains start indices of lines.
// line_start_idx[0] is start of line 1.
// line_start_idx[k] is start of line k+1 (0-indexed k).
// The last element of line_start_idx is source_file->buffer.size().
// Number of actual lines = line_start_idx.size() - 1 (if > 0), or 1 if buffer is empty (line_start_idx = [0]).

if (line_start_idx.empty()) { // Should not happen if constructor ran
return {};
}

if (line_number == 0 || line_number > (line_start_idx.size() - (line_start_idx.back() == source_file->buffer.size() ? 1:0) ) ) {
// If line_start_idx.back() is buffer.size(), then .size()-1 is the number of lines.
// Example: {"Hello\nWorld"} -> line_start_idx = [0, 6, 11], size=3. Lines = 2. Max line_number = 2.
// line_number > 3-1 -> line_number > 2.
// Example: {"Hello"} -> line_start_idx = [0, 5], size=2. Lines = 1. Max line_number = 1.
// line_number > 2-1 -> line_number > 1.
// Example: {""} -> line_start_idx = [0], size=1. Lines = 1 (empty line). Max line_number = 1.
// line_number > 1-0 -> line_number > 1. (This needs to be line_number > (line_start_idx.size() == 1 ? 1 : line_start_idx.size() -1)
if (source_file->buffer.empty() && line_number == 1 && line_start_idx.size() == 1 && line_start_idx[0] == 0) {
return ""; // Special case: line 1 of an empty file is empty.
}
// A more robust check for valid line numbers:
// A line number is valid if it's >= 1 and <= number of lines.
// Number of lines can be considered line_start_idx.size() - 1, unless the buffer is empty (1 line).
size_t num_lines = source_file->buffer.empty() ? 1 : line_start_idx.size() - 1;
if (line_number == 0 || line_number > num_lines) {
return {}; // Invalid line number
}
}


size_t start_idx = line_start_idx[line_number - 1];
// end_idx is the start of the next line, or buffer.size() if it's the last line.
size_t end_idx = line_start_idx[line_number];

size_t length = end_idx - start_idx;

// Trim trailing newline characters (\n or \r\n) from the view
if (length > 0) {
if (source_file->buffer[start_idx + length - 1] == '\n') {
length--;
if (length > 0 && source_file->buffer[start_idx + length - 1] == '\r') {
length--;
}
}
}

if (end == start) return {};
return std::string_view(
source_file->buffer.data() + start + (line_number != 1),
end - start - (line_number != 1));
return std::string_view(source_file->buffer.data() + start_idx, length);
}

} // namespace AL
28 changes: 21 additions & 7 deletions src/main.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <iostream>
#include <filesystem>
#include "lexer.hpp"
#include "parser.hpp" // Include parser.hpp
#include <sstream>
#include <fstream>
#include "utils.hpp"
Expand All @@ -10,14 +11,27 @@ int main(int argc, char *argv[]) {
std::filesystem::path path{argv[1]};
AL::SourceFile src{path.c_str(), ""};
AL::Lexer L1(src);

AL::Token current_token;
while (AL::TokenType::_EOF != (current_token = L1.GetNextToken()).type){
// std::cout << to_string(current_token.type) << "\t\t";
if (current_token.value != std::nullopt) {
std::cout << current_token.value.value() << '\t';
AL::Parser parser(L1); // Create an instance of the Parser

auto [ast_nodes, incomplete_ast] = parser.ParseSourceFile(); // Call ParseSourceFile()

if (incomplete_ast) { // Check the incomplete_ast flag
std::cerr << "Parsing failed or was incomplete." << std::endl;
} else {
// Iterate through ast_nodes and call dump()
for (const auto& node : ast_nodes) {
node->dump();
}
std::cout << std::endl;
}

// Remove or comment out the existing loop that iterates through tokens
// AL::Token current_token;
// while (AL::TokenType::_EOF != (current_token = L1.GetNextToken()).type){
// // std::cout << to_string(current_token.type) << "\t\t";
// if (current_token.value != std::nullopt) {
// std::cout << current_token.value.value() << '\t';
// }
// std::cout << std::endl;
// }
return 0;
}
53 changes: 33 additions & 20 deletions src/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ std::optional<Type> Parser::ParseType(){
return ret_type;
}

report(next_token.loc, "Expected type specifier here", lexer->GetLine(next_token.loc.line));
ReportError(next_token, "Expected type specifier (e.g., 'void' or identifier).");
return std::nullopt;
}

Expand All @@ -33,8 +33,10 @@ std::unique_ptr<Block> Parser::ParseBlock(){
PopNextToken(); // pop '{'

if (next_token.type != TokenType::RightBrace) {
return report(next_token.loc, "expected a '}' at the end of the block body",
lexer->GetLine(next_token.loc.line));
ReportError(next_token, "Expected '}' at the end of the block body.");
// Attempt to sync to the next potential function or EOF to allow further parsing.
// SyncOn(TokenType::Keyword_Function); // Or a more suitable token for recovery here
return nullptr; // Indicate parsing failure for this block
}
PopNextToken(); // pop '}'

Expand All @@ -49,39 +51,48 @@ std::unique_ptr<FunctionDeclaration> Parser::ParseFunctionDeclaration(){
PopNextToken(); // pop the func keyword

// check the <identifier>
if (next_token.type != TokenType::Identifier)
return report(next_token.loc, "Expected identifier here", lexer->GetLine(next_token.loc.line));

if (next_token.type != TokenType::Identifier) {
ReportError(next_token, "Expected function name (identifier).");
return nullptr;
}
// check if the <identifier> option has value
assert((next_token.value != std::nullopt) && "Identifier token without value");
std::string function_identifier = next_token.value.value();
PopNextToken(); // pop <identifier>

// chech the '('
if(next_token.type != TokenType::LeftParenthesis)
return report(next_token.loc, "Expected a '(' here", lexer->GetLine(next_token.loc.line));
if(next_token.type != TokenType::LeftParenthesis) {
ReportError(next_token, "Expected '(' after function name.");
return nullptr;
}
PopNextToken(); // pop the '('

// chech the ')'
if(next_token.type != TokenType::RightParenthesis)
return report(next_token.loc, "Expected a ')' here", lexer->GetLine(next_token.loc.line));
if(next_token.type != TokenType::RightParenthesis) {
ReportError(next_token, "Expected ')' after function parameters.");
return nullptr;
}
PopNextToken(); // pop the ')'

// check the ':'
if(next_token.type != TokenType::Colon)
return report(next_token.loc, "Expected a ':' here", lexer->GetLine(next_token.loc.line));
if(next_token.type != TokenType::Colon) {
ReportError(next_token, "Expected ':' before function return type.");
return nullptr;
}
PopNextToken(); // pop the ':'

// check for type
std::optional<Type> type_resolved = ParseType();
if (type_resolved == std::nullopt) {
// ParseType already called ReportError
return nullptr;
}

// chech the '{'
if(next_token.type != TokenType::LeftBrace)
return report(next_token.loc, "Expected function body here", lexer->GetLine(next_token.loc.line));

if(next_token.type != TokenType::LeftBrace) {
ReportError(next_token, "Expected '{' to start function body.");
return nullptr;
}
// check for body
std::unique_ptr<Block> body_resolved = (ParseBlock());
if (body_resolved == nullptr) {
Expand All @@ -102,22 +113,24 @@ Parser::ParseSourceFile() {

while(next_token.type != TokenType::_EOF){
if(next_token.type != TokenType::Keyword_Function){
report(next_token.loc, "Only functions are allowed on top level", lexer->GetLine(next_token.loc.line));

SyncOn(TokenType::Keyword_Function);
ReportError(next_token, "Expected 'func' keyword for function declaration at top level.");
SyncOn(TokenType::Keyword_Function); // Try to find next function
if(next_token.type == TokenType::_EOF) break; // Stop if EOF reached after sync
continue;
}

std::unique_ptr<FunctionDeclaration> func = ParseFunctionDeclaration();
if(func == nullptr) {
SyncOn(TokenType::Keyword_Function);
// ParseFunctionDeclaration or one of its sub-parsers already called ReportError
SyncOn(TokenType::Keyword_Function); // Try to find next function
if(next_token.type == TokenType::_EOF) break; // Stop if EOF reached after sync
continue;
}

function_declarations.emplace_back(std::move(func));
}

return {std::move(function_declarations), !incomplete_AST};
return {std::move(function_declarations), incomplete_AST}; // Return incomplete_AST directly
}

} // namespace AL
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set(test A_Language_tests)

set(test_sources
lexer_test.cpp
parser_test.cpp
)

add_executable(${test} ${test_sources})
Expand Down
34 changes: 34 additions & 0 deletions test/parser_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include "gtest/gtest.h"
#include "lexer.hpp"
#include "parser.hpp"
#include "ast.hpp" // For FunctionDeclaration
#include <vector>
#include <memory> // For std::unique_ptr

// End-to-end parser smoke test: lexes and parses the one-line fixture
// "func main() : void {}" and checks that exactly one well-formed
// FunctionDeclaration named "main" with return type "void" is produced.
TEST(ParserTest, HandlesSimpleFunction) {
    // Fixture is loaded from disk; path is relative to the test's working
    // directory (assumes the tests run from the repository root).
    AL::SourceFile src{"test/sample_parser_0.AL", ""};

    // Lex, then parse the token stream.
    AL::Lexer lex{src};
    AL::Parser p{lex};
    auto [nodes, incomplete] = p.ParseSourceFile();

    // The parse must succeed and yield at least one non-null node.
    ASSERT_FALSE(incomplete) << "Parsing failed or was incomplete.";
    ASSERT_FALSE(nodes.empty()) << "No AST nodes were produced.";
    ASSERT_NE(nodes[0], nullptr) << "First AST node is null.";

    // The top-level node must be a FunctionDeclaration; downcast to inspect it.
    auto* fn = dynamic_cast<AL::FunctionDeclaration*>(nodes[0].get());
    ASSERT_NE(fn, nullptr) << "First AST node is not a FunctionDeclaration.";

    // Verify the declaration's name and return type.
    EXPECT_EQ(fn->identifier, "main");
    EXPECT_EQ(fn->type.type_name, "void");
}
1 change: 1 addition & 0 deletions test/sample_parser_0.AL
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
func main() : void {}
Loading